Background

Open-Meteo maintains an API for historical weather that allows for non-commercial usage of historical weather data maintained by the website.

This file builds on _v001 to run exploratory analysis on some historical weather data.

Functions and Libraries

The exploration process uses tidyverse, several generic custom functions, and several functions specific to Open Meteo processing. First, tidyverse and the generic functions are loaded:

library(tidyverse) # tidyverse functionality is included throughout
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
source("./Generic_Added_Utility_Functions_202105_v001.R") # Basic functions

Next, specific functions written in _v001 are copied:

# Helper function for reading a partial CSV file
partialCSVRead <- function(loc, firstRow=1L, lastRow=+Inf, col_names=TRUE, ...) {
    
    # FUNCTION arguments
    # loc: file location
    # firstRow: first row that is relevant to the partial file read (whether header line or data line)
    # last Row: last row that is relevant to the partial file read (+Inf means read until last line of file)
    # col_names: the col_names parameter passed to readr::read_csv
    #            TRUE means header=TRUE (get column names from file, read data starting on next line)
    #            FALSE means header=FALSE (auto-generate column names, read data starting on first line)
    #            character vector means use these as column names (read data starting on first line)
    # ...: additional arguments passed to read_csv

    # Read the file and return
    # skip: rows to be skipped are all those prior to firstRow
    # n_max: maximum rows read are lastRow-firstRow, with an additional data row when col_names is not TRUE
    readr::read_csv(loc, 
                    col_names=col_names,
                    skip=firstRow-1, 
                    n_max=lastRow-firstRow+ifelse(isTRUE(col_names), 0, 1), 
                    ...
                    )
    
}


# Get the break points for gaps in a vector (e.g., 0, 3, 5:8, 20 has break points 0, 3, 5, 20 and 0, 3, 8, 30)
vecGaps <- function(x, addElements=c(), sortUnique=TRUE) {
    
    if(length(addElements)>0) x <- c(addElements, x)
    if(isTRUE(sortUnique)) x <- unique(sort(x))
    list("starts"=c(x[is.na(lag(x)) | x-lag(x)>1], +Inf), 
         "ends"=x[is.na(lead(x)) | lead(x)-x>1]
         )
    
}


# Find the break points in a single file
flatFileGaps <- function(loc) {

    which(stringr::str_length(readLines(loc))==0) %>% vecGaps(addElements=0)
    
}


# Read all relevant data as CSV with header
readMultiCSV <- function(loc, col_names=TRUE, ...) {

    gaps <- flatFileGaps(loc)
    
    lapply(seq_along(gaps$ends), 
           FUN=function(x) partialCSVRead(loc, 
                                          firstRow=gaps$ends[x]+1, 
                                          lastRow=gaps$starts[x+1]-1, 
                                          col_names=col_names, 
                                          ...
                                          )
           )
    
}


# Create URL with specified parameters for downloading data from Open Meteo
openMeteoURLCreate <- function(mainURL="https://archive-api.open-meteo.com/v1/archive", 
                               lat=45, 
                               lon=-90, 
                               startDate=paste(year(Sys.Date())-1, "01", "01", sep="-"), 
                               endDate=paste(year(Sys.Date())-1, "12", "31", sep="-"), 
                               hourlyMetrics=NULL, 
                               dailyMetrics=NULL,
                               tz="GMT", 
                               ...
                               ) {
    
    # Create formatted string
    fString <- paste0(mainURL, 
                      "?latitude=", 
                      lat, 
                      "&longitude=", 
                      lon, 
                      "&start_date=", 
                      startDate, 
                      "&end_date=", 
                      endDate
                      )
    if(!is.null(hourlyMetrics)) fString <- paste0(fString, "&hourly=", hourlyMetrics)
    if(!is.null(dailyMetrics)) fString <- paste0(fString, "&daily=", dailyMetrics)
    
    # Return the formatted string
    paste0(fString, "&timezone=", stringr::str_replace(tz, "/", "%2F"), ...)
    
}


# Helper function to simplify entry of parameters for Open Meteo download requests
helperOpenMeteoURL <- function(cityName=NULL,
                               lat=NULL,
                               lon=NULL,
                               hourlyMetrics=NULL,
                               hourlyIndices=NULL,
                               hourlyDesc=tblMetricsHourly,
                               dailyMetrics=NULL,
                               dailyIndices=NULL,
                               dailyDesc=tblMetricsDaily,
                               startDate=NULL, 
                               endDate=NULL, 
                               tz=NULL,
                               ...
                               ) {
    
    # Convert city to lat/lon if lat/lon are NULL
    if(is.null(lat) | is.null(lon)) {
        if(is.null(cityName)) stop("\nMust provide lat/lon or city name available in maps::us.cities\n")
        cityData <- maps::us.cities %>% tibble::as_tibble() %>% filter(name==cityName)
        if(nrow(cityData)!=1) stop("\nMust provide city name that maps uniquely to maps::us.cities$name\n")
        lat <- cityData$lat[1]
        lon <- cityData$long[1]
    }
    
    # Get hourly metrics by index if relevant
    if(is.null(hourlyMetrics) & !is.null(hourlyIndices)) {
        hourlyMetrics <- hourlyDesc %>% slice(hourlyIndices) %>% pull(metric)
        hourlyMetrics <- paste0(hourlyMetrics, collapse=",")
        cat("\nHourly metrics created from indices:", hourlyMetrics, "\n\n")
    }
    
    # Get daily metrics by index if relevant
    if(is.null(dailyMetrics) & !is.null(dailyIndices)) {
        dailyMetrics <- dailyDesc %>% slice(dailyIndices) %>% pull(metric)
        dailyMetrics <- paste0(dailyMetrics, collapse=",")
        cat("\nDaily metrics created from indices:", dailyMetrics, "\n\n")
    }
    
    # Use default values from OpenMeteoURLCreate() for startDate, endDate, and tz if passed as NULL
    if(is.null(startDate)) startDate <- eval(formals(openMeteoURLCreate)$startDate)
    if(is.null(endDate)) endDate <- eval(formals(openMeteoURLCreate)$endDate)
    if(is.null(tz)) tz <- eval(formals(openMeteoURLCreate)$tz)
    
    # Create and return URL
    openMeteoURLCreate(lat=lat,
                       lon=lon, 
                       startDate=startDate, 
                       endDate=endDate, 
                       hourlyMetrics=hourlyMetrics, 
                       dailyMetrics=dailyMetrics, 
                       tz=tz,
                       ...
                       )
    
}


# Read JSON data returned from Open Meteo
readOpenMeteoJSON <- function(js, mapDaily=tblMetricsDaily, mapHourly=tblMetricsHourly) {
    
    # FUNCTION arguments: 
    # js: JSON list returned by download from Open-Meteo
    # mapDaily: mapping file for daily metrics
    # mapHourly: mapping file for hourly metrics
    
    # Get the object and names
    jsObj <- jsonlite::read_json(js, simplifyVector = TRUE)
    nms <- jsObj %>% names()
    cat("\nObjects in JSON include:", paste(nms, collapse=", "), "\n\n")
    
    # Set default objects as NULL
    tblDaily <- NULL
    tblHourly <- NULL
    tblUnitsDaily <- NULL
    tblUnitsHourly <- NULL
    
    # Get daily and hourly as tibble if relevant
    if("daily" %in% nms) tblDaily <- jsObj$daily %>% tibble::as_tibble() %>% omProcessDaily()
    if("hourly" %in% nms) tblHourly <- jsObj$hourly %>% tibble::as_tibble() %>% omProcessHourly()
    
    # Helper function for unit conversions
    helperMetricUnit <- function(x, mapper, desc=NULL) {
        if(is.null(desc)) 
            desc <- as.list(match.call())$x %>% 
                deparse() %>% 
                stringr::str_replace_all(pattern=".*\\$", replacement="")
        x %>% 
            tibble::as_tibble() %>% 
            pivot_longer(cols=everything()) %>% 
            left_join(mapper, by=c("name"="metric")) %>% 
            mutate(value=stringr::str_replace(value, "\u00b0", "deg ")) %>% 
            mutate(metricType=desc) %>% 
            select(metricType, everything())
    }
    
    # Get the unit descriptions
    if("daily_units" %in% nms) tblUnitsDaily <- helperMetricUnit(jsObj$daily_units, mapDaily)
    if("hourly_units" %in% nms) tblUnitsHourly <- helperMetricUnit(jsObj$hourly_units, mapHourly)
    if(is.null(tblUnitsDaily) & !is.null(tblUnitsHourly)) tblUnits <- tblUnitsHourly
    else if(!is.null(tblUnitsDaily) & is.null(tblUnitsHourly)) tblUnits <- tblUnitsDaily
    else if(!is.null(tblUnitsDaily) & !is.null(tblUnitsHourly)) 
        tblUnits <- bind_rows(tblUnitsHourly, tblUnitsDaily)
    else tblUnits <- NULL
    
    # Put everything else together
    tblDescription <- jsObj[setdiff(nms, c("hourly", "hourly_units", "daily", "daily_units"))] %>%
        tibble::as_tibble()
    
    # Return the list objects
    list(tblDaily=tblDaily, tblHourly=tblHourly, tblUnits=tblUnits, tblDescription=tblDescription)
    
}


# Return Open meteo metadata in prettified format
prettyOpenMeteoMeta <- function(df, extr="tblDescription") {
    if("list" %in% class(df)) df <- df[[extr]]
    for(name in names(df)) {
        cat("\n", name, ": ", df %>% pull(name), sep="")
    }
    cat("\n\n")
}


# Process Open Meteo daily data
omProcessDaily <- function(tbl, extr="tblDaily") {
    if("list" %in% class(tbl)) tbl <- tbl[[extr]]
    tbl %>% mutate(date=lubridate::ymd(time)) %>% select(date, everything())
}


# Process Open meteo hourly data
omProcessHourly <- function(tbl, extr="tblHourly") {
    if("list" %in% class(tbl)) tbl <- tbl[[extr]]
    tbl %>% 
        mutate(origTime=time, 
               time=lubridate::ymd_hm(time), 
               date=lubridate::date(time), 
               hour=lubridate::hour(time)
               ) %>% 
        select(time, date, hour, everything())
}


# Simple predictive model for categorical variable
simpleOneVarPredict <- function(df, 
                                tgt, 
                                prd, 
                                dfTest=NULL,
                                nPrint=30, 
                                showPlot=TRUE, 
                                returnData=TRUE
                                ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame or tibble with key elements (training data set)
    # tgt: target variable
    # prd: predictor variable
    # dfTest: test dataset for applying predictions
    # nPrint: maximum number of lines of confusion matrix to print
    #         0 means do not print any summary statistics
    # showPlot: boolean, should overlap plot be created and shown?
    
    # Counts of predictor to target variable
    dfPred <- df %>%
        group_by(across(all_of(c(prd, tgt)))) %>%
        summarize(n=n(), .groups="drop") %>%
        arrange(across(all_of(prd)), desc(n)) %>%
        group_by(across(all_of(prd))) %>%
        mutate(correct=row_number()==1, predicted=first(get(tgt))) %>%
        ungroup()

    # Confusion matrix and accuracy
    dfConf <- dfPred %>%
        group_by(across(all_of(c(tgt, "correct")))) %>%
        summarize(n=sum(n), .groups="drop") %>%
        pivot_wider(id_cols=tgt, names_from=correct, values_from=n, values_fill=0) %>%
        mutate(n=`TRUE`+`FALSE`, 
               pctCorrect=`TRUE`/n, 
               pctNaive=1/(nrow(.)), 
               lift=pctCorrect/pctNaive-1
               )
    
    # Overall confusion matrix
    dfConfAll <- dfConf %>%
        summarize(nMax=max(n), across(c(`FALSE`, `TRUE`, "n"), sum)) %>%
        mutate(pctCorrect=`TRUE`/n, 
               pctNaive=nMax/n, 
               lift=pctCorrect/pctNaive-1, 
               nBucket=length(unique(dfPred[[prd]]))
               )
    
    # Print confusion matrices
    if(nPrint > 0) {
        cat("\nAccuracy by target subgroup (training data):\n")
        dfConf %>% print(n=nPrint)
        cat("\nOverall Accuracy (training data):\n")
        dfConfAll %>% print(n=nPrint)
    }
    
    # Plot of overlaps
    if(isTRUE(showPlot)) {
        p1 <- dfPred %>%
            group_by(across(c(all_of(tgt), "predicted", "correct"))) %>%
            summarize(n=sum(n), .groups="drop") %>%
            ggplot(aes(x=get(tgt), y=predicted)) + 
            labs(x="Actual", 
                 y="Predicted", 
                 title=paste0("Training data - Actual vs. predicted ", tgt), 
                 subtitle=paste0("(using ", prd, ")")
                 ) + 
            geom_text(aes(label=n)) + 
            geom_tile(aes(fill=correct), alpha=0.25)
        print(p1)
    }
    
    # Create metrics for test dataset if requested
    if(!is.null(dfTest)) {
        # Get maximum category from training data
        mostPredicted <- count(dfPred, predicted, wt=n) %>% slice(1) %>% pull(predicted)
        # Get mapping of metric to prediction
        dfPredict <- dfPred %>% 
            group_by(across(all_of(c(prd, "predicted")))) %>% 
            summarize(n=sum(n), .groups="drop")
        # Create predictions for test data
        dfPredTest <- dfTest %>%
            select(all_of(c(prd, tgt))) %>%
            left_join(select(dfPredict, -n)) %>%
            replace_na(list(predicted=mostPredicted)) %>%
            group_by(across(all_of(c(prd, tgt, "predicted")))) %>%
            summarize(n=n(), .groups="drop") %>%
            mutate(correct=(get(tgt)==predicted))
        # Create confusion statistics for test data
        dfConfTest <- dfPredTest %>%
            group_by(across(all_of(c(tgt, "correct")))) %>%
            summarize(n=sum(n), .groups="drop") %>%
            pivot_wider(id_cols=tgt, names_from=correct, values_from=n, values_fill=0) %>%
            mutate(n=`TRUE`+`FALSE`, 
                   pctCorrect=`TRUE`/n, 
                   pctNaive=1/(nrow(.)), 
                   lift=pctCorrect/pctNaive-1
                   )
        # Overall confusion matrix for test data
        dfConfAllTest <- dfConfTest %>%
            summarize(nMax=max(n), across(c(`FALSE`, `TRUE`, "n"), sum)) %>%
            mutate(pctCorrect=`TRUE`/n, 
                   pctNaive=nMax/n, 
                   lift=pctCorrect/pctNaive-1, 
                   nBucket=length(unique(dfConfTest[[prd]]))
               )
        # Print confusion matrices
        if(nPrint > 0) {
            cat("\nAccuracy by target subgroup (testing data):\n")
            dfConfTest %>% print(n=nPrint)
            cat("\nOverall Accuracy (testing data):\n")
            dfConfAllTest %>% print(n=nPrint)
            }
    } else {
        dfPredTest <- NULL
        dfConfTest <- NULL
        dfConfAllTest <- NULL
        
    }
    
    # Return data if requested
    if(isTRUE(returnData)) list(dfPred=dfPred, 
                                dfConf=dfConf, 
                                dfConfAll=dfConfAll, 
                                dfPredTest=dfPredTest, 
                                dfConfTest=dfConfTest, 
                                dfConfAllTest=dfConfAllTest
                                )
    
}


# Fit a single predictor to a single categorical variable
simpleOneVarFit <- function(df, 
                            tgt, 
                            prd, 
                            rankType="last", 
                            naMethod=TRUE
                            ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame or tibble with key elements (training data set)
    # tgt: target variable
    # prd: predictor variable
    # rankType: method for breaking ties of same n, passed to base::rank as ties.method=
    # naMethod: method for handling NA in ranks, passed to base::rank as na.last=
    
    # Counts of predictor to target variable, and associated predictions
    df %>%
        group_by(across(all_of(c(prd, tgt)))) %>%
        summarize(n=n(), .groups="drop") %>%
        arrange(across(all_of(prd)), desc(n), across(all_of(tgt))) %>%
        group_by(across(all_of(prd))) %>%
        mutate(rankN=n()+1-rank(n, ties.method=rankType, na.last=naMethod)) %>%
        arrange(across(all_of(prd)), rankN) %>%
        ungroup()

}


# Create categorical predictions mapper
simpleOneVarMapper <- function(df, tgt, prd) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame or tibble from SimpleOneVarFit()
    # tgt: target variable
    # prd: predictor variable
    
    # Get the most common actual results
    dfCommon <- df %>% count(across(all_of(tgt)), wt=n, sort=TRUE)
    
    # Get the predictions
    dfPredictor <- df %>%
        group_by(across(all_of(prd))) %>%
        filter(row_number()==1) %>%
        select(all_of(c(prd, tgt))) %>%
        ungroup()
    
    list(dfPredictor=dfPredictor, dfCommon=dfCommon)
    
}


# Map the categorical predictions to unseen data
simpleOneVarApplyMapper <- function(df, 
                                    tgt,
                                    prd, 
                                    mapper, 
                                    mapperDF="dfPredictor", 
                                    mapperDefault="dfCommon",
                                    prdName="predicted"
                                    ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame containing prd for predicting tgt
    # tgt: target variable in df
    # prd: predictor variable in df
    # mapper: mapping list from sinpleOneVarMapper()
    # mapperDF: element that can be used to merge mappings
    # mapperDefault: element that can be used for NA resulting from merging mapperDF
    # prdName: name for the prediction variable
    
    # Extract the mapper and default value
    vecRename <- c(prdName) %>% purrr::set_names(tgt)
    dfMap <- mapper[[mapperDF]] %>% select(all_of(c(prd, tgt))) %>% colRenamer(vecRename=vecRename)
    chrDefault <- mapper[[mapperDefault]] %>% slice(1) %>% pull(tgt)
    
    # Merge mappings to df
    df %>%
        left_join(dfMap, by=prd) %>%
        replace_na(list("predicted"=chrDefault))
    
}


# Create confusion matrix data for categorical predictions
simpleOneVarConfusionData <- function(df, 
                                      tgtOrig,
                                      tgtPred, 
                                      otherVars=c(),
                                      weightBy="n"
                                      ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame from simpleOneVarApplyMapper()
    # tgtOrig: original target variable name in df
    # tgtPred: predicted target variable name in df
    # otherVars: other variables to be kept (will be grouping variables)
    # weightBy: weighting variable for counts in df (NULL means count each row of df as 1)
    
    # Confusion matrix data creation
    df %>%
        group_by(across(all_of(c(tgtOrig, tgtPred, otherVars)))) %>%
        summarize(n=if(!is.null(weightBy)) sum(get(weightBy)) else n(), .groups="drop") %>%
        mutate(correct=get(tgtOrig)==get(tgtPred))
    
}


# Print and plot confusion matrix for categorical predictions
simpleOneVarConfusionReport <- function(df, 
                                        tgtOrig,
                                        tgtPred, 
                                        otherVars=c(), 
                                        printConf=TRUE,
                                        printConfOrig=printConf, 
                                        printConfPred=printConf,
                                        printConfOverall=printConf, 
                                        plotConf=TRUE, 
                                        plotDesc="",
                                        nBucket=NA, 
                                        predictorVarName="", 
                                        returnData=FALSE
                                        ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame from simpleOneVarConfusionData()
    # tgtOrig: original target variable name in df
    # tgtPred: predicted target variable name in df
    # otherVars: other variables to be kept (will be grouping variables) - NOT IMPLEMENTED
    # printConf: boolean, should confusion matrix data be printed? Applies to all three
    # printConfOrig: boolean, should confusion data be printed based on original target variable?
    # printConfPred: boolean, should confusion data be printed based on predicted target variable?
    # printConfOverall: boolean, should overall confusion data be printed?
    # plotConf: boolean, should confusion overlap data be plotted?
    # plotDesc: descriptive label to be included in front of plot title
    # nBucket: number of buckets used for prediction (pass from previous data)
    # predictorVarName: variable name to be included in chart description
    # returnData: boolean, should the confusion matrices be returned?
    
    # Confusion data based on original target variable
    if(isTRUE(printConfOrig) | isTRUE(returnData)) {
        dfConfOrig <- df %>%
            group_by(across(all_of(c(tgtOrig)))) %>%
            summarize(right=sum(n*correct), wrong=sum(n)-right, n=sum(n), .groups="drop") %>%
            mutate(pctRight=right/n, pctNaive=n/(sum(n)), lift=pctRight/pctNaive-1)
    }

    # Confusion data based on predicted target variable
    if(isTRUE(printConfPred) | isTRUE(returnData)) {
        dfConfPred <- df %>%
            group_by(across(all_of(c(tgtPred)))) %>%
            summarize(right=sum(n*correct), wrong=sum(n)-right, n=sum(n), .groups="drop") %>%
            mutate(pctRight=right/n)
    }

    # Overall confusion data
    if(isTRUE(printConfOverall) | isTRUE(returnData)) {
        maxNaive <- df %>%
            group_by(across(all_of(tgtOrig))) %>%
            summarize(n=sum(n), .groups="drop") %>%
            arrange(desc(n)) %>%
            slice(1) %>%
            pull(n)
        dfConfOverall <- df %>%
            summarize(right=sum(n*correct), wrong=sum(n)-right, n=sum(n), .groups="drop") %>%
            mutate(maxN=maxNaive, pctRight=right/n, pctNaive=maxN/n, lift=pctRight/pctNaive-1, nBucket=nBucket)
    }
    
    # Confusion report based on original target variable
    if(isTRUE(printConfOrig)) {
        cat("\nConfusion data based on original target variable:", tgtOrig, "\n")
        dfConfOrig %>%
            print(n=50)
    }

    # Confusion report based on predicted target variable
    if(isTRUE(printConfPred)) {
        cat("\nConfusion data based on predicted target variable:", tgtPred, "\n")
        dfConfPred %>%
            print(n=50)
    }
    
    # Overall confusion matrix
    if(isTRUE(printConfOverall)) {
        cat("\nOverall confusion matrix\n")
        dfConfOverall %>%
            print(n=50)
    }
    
    # Plot of overlaps
    if(isTRUE(plotConf)) {
        p1 <- df %>%
            group_by(across(all_of(c(tgtOrig, tgtPred, "correct")))) %>%
            summarize(n=sum(n), .groups="drop") %>%
            ggplot(aes(x=get(tgtOrig), y=get(tgtPred))) + 
            labs(x="Actual", 
                 y="Predicted", 
                 title=paste0(plotDesc, "Actual vs. predicted ", tgtOrig), 
                 subtitle=paste0("(using ", predictorVarName, ")")
                 ) + 
            geom_text(aes(label=n)) + 
            geom_tile(aes(fill=correct), alpha=0.25)
        print(p1)
    }
    
    # Return data if requested
    if(isTRUE(returnData)) list(dfConfOrig=dfConfOrig, dfConfPred=dfConfPred, dfConfOverall=dfConfOverall)
    
}


# Process for chaining predictor, applier, and confusion matrix for categorical variables
simpleOneVarChain <- function(df,
                              tgt,
                              prd,
                              mapper=NULL, 
                              rankType="last", 
                              naMethod=TRUE, 
                              printReport=TRUE, 
                              plotDesc="",
                              returnData=TRUE, 
                              includeConfData=FALSE
                              ) {

    # FUNCTION ARGUMENTS:
    # df: data frame or tibble with key elements (training or testing data set)
    # tgt: target variable
    # prd: predictor variable
    # mapper: mapping file to be applied for predictions (NULL means create from simpleOneVarApply())
    # rankType: method for breaking ties of same n, passed to base::rank as ties.method=
    # naMethod: method for handling NA in ranks, passed to base::rank as na.last=    
    # printReport: boolean, should the confusion report data and plot be printed?
    # plotDesc: descriptive label to be included in front of plot title
    # returnData: boolean, should data elements be returned?
    # includeConfData: boolean, should confusion data be returned?
    
    # Create the summary of predictor-target-n
    dfFit <- simpleOneVarFit(df, tgt=tgt, prd=prd, rankType=rankType, naMethod=naMethod)     

    # Create the mapper if it does not already exist
    if(is.null(mapper)) mapper <- simpleOneVarMapper(dfFit, tgt=tgt, prd=prd)
    
    # Apply mapper to data
    dfApplied <- simpleOneVarApplyMapper(dfFit, tgt=tgt, prd=prd, mapper=mapper)

    # Create confusion data
    dfConfusion <- simpleOneVarConfusionData(dfApplied, tgtOrig=tgt, tgtPred="predicted")
    
    # Create confusion report if requested
    if(isTRUE(printReport) | isTRUE(includeConfData)) {
        dfConfReport <- simpleOneVarConfusionReport(df=dfConfusion, 
                                                    tgtOrig=tgt, 
                                                    tgtPred="predicted", 
                                                    nBucket=length(unique(dfApplied[[prd]])), 
                                                    predictorVarName=prd, 
                                                    printConf=printReport, 
                                                    plotConf=printReport,
                                                    plotDesc=plotDesc,
                                                    returnData=includeConfData
                                                    )
    }
    
    # Return data if requested
    if(isTRUE(returnData)) {
        ret <- list(dfFit=dfFit, mapper=mapper, dfApplied=dfApplied, dfConfusion=dfConfusion)
        if(isTRUE(includeConfData)) ret<-c(ret, list(dfConfData=dfConfReport))
        ret
    }
    
}


# Adds a train-test component for single variable predictions
simpleOneVarTrainTest <- function(dfTrain,
                                  dfTest,
                                  tgt,
                                  prd,
                                  rankType="last", 
                                  naMethod=TRUE, 
                                  printReport=FALSE, 
                                  includeConfData=TRUE, 
                                  returnData=TRUE
                              ) {

    # FUNCTION ARGUMENTS:
    # dfTrain: data frame or tibble with key elements (training data set)
    # dfTest: data frame or tibble with key elements (testing data set)
    # tgt: target variable
    # prd: predictor variable
    # rankType: method for breaking ties of same n, passed to base::rank as ties.method=
    # naMethod: method for handling NA in ranks, passed to base::rank as na.last=    
    # printReport: boolean, should the confusion report data and plot be printed?
    # includeConfData: boolean, should confusion data be returned?
    # returnData: boolean, should data elements be returned?
    
    # Fit the training data
    tmpTrain <- simpleOneVarChain(df=dfTrain, 
                                  tgt=tgt, 
                                  prd=prd,
                                  rankType=rankType,
                                  naMethod=naMethod,
                                  printReport=printReport,
                                  plotDesc="Training data: ",
                                  returnData=TRUE,
                                  includeConfData=includeConfData
                                  )
    
    # Fit the testing data
    tmpTest <- simpleOneVarChain(df=dfTest, 
                                 tgt=tgt, 
                                 prd=prd,
                                 mapper=tmpTrain$mapper,
                                 rankType=rankType,
                                 naMethod=naMethod,
                                 printReport=printReport,
                                 plotDesc="Testing data: ",
                                 returnData=TRUE,
                                 includeConfData=includeConfData
                                 )
    
    # Return data if requested
    if(isTRUE(returnData)) list(tmpTrain=tmpTrain, tmpTest=tmpTest)
    
}


# Plot the means by cluster and variable for a k-means object
plotClusterMeans <- function(km, nrow=NULL, ncol=NULL, scales="fixed") {

    # FUNCTION ARGUMENTS
    # km: object returned by stats::kmeans(...)
    # nrow: number of rows for faceting (NULL means default)
    # ncol: number of columns for faceting (NULL means default)
    # scales: passed to facet_wrap as scales=scales
    
    # Assess clustering by dimension
    p1 <- km$centers %>%
        tibble::as_tibble() %>%
        mutate(cluster=row_number()) %>%
        pivot_longer(cols=-c(cluster)) %>%
        ggplot(aes(x=fct_reorder(name, 
                                 value, 
                                 .fun=function(a) ifelse(length(a)==2, a[2]-a[1], diff(range(a)))
                                 ), 
                   y=value
                   )
               ) + 
        geom_point(aes(color=factor(cluster))) + 
        scale_color_discrete("Cluster") + 
        facet_wrap(~factor(cluster), nrow=nrow, ncol=ncol, scales=scales) +
        labs(title=paste0("Cluster means (kmeans, centers=", nrow(km$centers), ")"), 
             x="Metric", 
             y="Cluster mean"
             ) + 
        geom_hline(yintercept=median(km$centers), lty=2) +
        coord_flip()
    print(p1)
    
}


# Plot percentage by cluster
plotClusterPct <- function(df, km, keyVars, nRowFacet=1, printPlot=TRUE) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame initially passed to stats::kmeans(...)
    # km: object returned by stats::kmeans(...)
    # keyVars: character vector of length 1 (y-only, x will be cl) or length 2 (x, y, cl will facet)
    # nRowFacet: number of rows for facetting (only relevant if length(keyVars) is 2)
    # printPlot: boolean, should plot be printed? (if not true, plot will be returned)
    
    # Check length of keyVars
    if(!(length(keyVars) %in% c(1, 2))) stop("\nArgument keyVars must be length-1 or length-2\n")
    
    p1 <- df %>%
        mutate(cl=factor(km$cluster)) %>%
        group_by(across(c(all_of(keyVars), "cl"))) %>%
        summarize(n=n(), .groups="drop") %>%
        group_by(across(all_of(keyVars))) %>%
        mutate(pct=n/sum(n)) %>%
        ungroup() %>%
        ggplot() + 
        scale_fill_continuous(low="white", high="green") + 
        labs(title=paste0("Percentage by cluster (kmeans with ", nrow(km$centers), " centers)"), 
             x=ifelse(length(keyVars)==1, "Cluster", keyVars[1]), 
             y=ifelse(length(keyVars)==1, keyVars[1], keyVars[2])
             )
    if(length(keyVars)==1) p1 <- p1 + geom_tile(aes(fill=pct, x=cl, y=get(keyVars[1])))
    if(length(keyVars)==2) {
        p1 <- p1 + 
            geom_tile(aes(fill=pct, x=get(keyVars[1]), y=get(keyVars[2]))) + 
            facet_wrap(~cl, nrow=nRowFacet)
    }
    
    if(isTRUE(printPlot)) print(p1)
    else return(p1)
    
}


# Run k-means (or use passed k-means object) and plot centers and percentages of observations
runKMeans <- function(df, 
                      km=NULL,
                      vars=NULL, 
                      centers=2, 
                      nStart=1L, 
                      iter.max=10L, 
                      seed=NULL, 
                      plotMeans=FALSE,
                      nrowMeans=NULL,
                      plotPct=NULL, 
                      nrowPct=1, 
                      returnKM=is.null(km)
                      ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame for clustering
    # km: k-means object (will shut off k-means processing and run as plot-only)
    # vars: variables to be used for clustering (NULL means everything in df)
    # centers: number of centers
    # nStart: passed to kmeans
    # iter.max: passed to kmeans
    # seed: seed to be set (if NULL, no seed is set)
    # plotMeans: boolean, plot variable means by cluster?
    # nrowMeans: argument passed as nrow for faceting rows in plotClusterMeans() - NULL is default ggplot2
    # plotPct: list of character vectors to be passed sequentially as keyVars to plotClusterPct()
    #          NULL means do not run
    #          pctByCluster=list(c("var1"), c("var2", "var3")) will run plotting twice
    # nrowPct: argument for faceting number of rows in plotClusterPct()
    # returnKM: boolean, should the k-means object be returned?
    
    # Set seed if requested
    if(!is.null(seed)) set.seed(seed)
    
    # Get the variable names if passed as NULL
    if(is.null(vars)) vars <- names(df)
    
    # Run the k-means process if the object has not been passed
    if(is.null(km)) {
        km <- df %>%
            select(all_of(vars)) %>% 
            kmeans(centers=centers, iter.max=iter.max, nstart=nStart)
    }

    # Assess clustering by dimension if requested
    if(isTRUE(plotMeans)) plotClusterMeans(km, nrow=nrowMeans)
    if(!is.null((plotPct))) 
        for(ctr in 1:length(plotPct)) 
            plotClusterPct(df=df, km=km, keyVars=plotPct[[ctr]], nRowFacet=nrowPct)
    
    # Return the k-means object
    if(isTRUE(returnKM)) return(km)
    
}


# Assign points to closest center of a passed k-means object
assignKMeans <- function(km, df, returnAllDistanceData=FALSE) {
    
    # FUNCTION ARGUMENTS:
    # km: a k-means object
    # df: data frame or tibble
    # returnAllDistanceData: boolean, should the distance data and clusters be returned?
    #                        TRUE returns a data frame with distances as V1, V2, ..., and cluster as cl
    #                        FALSE returns a vector of cluster assignments as integers
    
    # Select columns from df to match km
    df <- df %>% select(all_of(colnames(km$centers)))
    if(!all.equal(names(df), colnames(km$centers))) stop("\nName mismatch in clustering and frame\n")
    
    # Create the distances and find clusters
    distClust <- sapply(seq_len(nrow(km$centers)), 
                        FUN=function(x) sqrt(rowSums(sweep(as.matrix(df), 
                                                           2, 
                                                           t(as.matrix(km$centers[x,,drop=FALSE]))
                                                           )**2
                                                     )
                                             )
                        ) %>% 
        as.data.frame() %>% 
        tibble::as_tibble() %>% 
        mutate(cl=apply(., 1, which.min))
    
    # Return the proper file
    if(isTRUE(returnAllDistanceData)) return(distClust)
    else return(distClust$cl)
    
}

Key mapping tables for available metrics are also copied:

hourlyMetrics <- "temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm"
dailyMetrics <- "weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration"

hourlyDescription <- "Air temperature at 2 meters above ground\nRelative humidity at 2 meters above ground\nDew point temperature at 2 meters above ground\nApparent temperature is the perceived feels-like temperature combining wind chill factor, relative humidity and solar radiation\nAtmospheric air pressure reduced to mean sea level (msl) or pressure at surface. Typically pressure on mean sea level is used in meteorology. Surface pressure gets lower with increasing elevation.\nAtmospheric air pressure reduced to mean sea level (msl) or pressure at surface. Typically pressure on mean sea level is used in meteorology. Surface pressure gets lower with increasing elevation.\nTotal precipitation (rain, showers, snow) sum of the preceding hour. Data is stored with a 0.1 mm precision. If precipitation data is summed up to monthly sums, there might be small inconsistencies with the total precipitation amount.\nOnly liquid precipitation of the preceding hour including local showers and rain from large scale systems.\nSnowfall amount of the preceding hour in centimeters. For the water equivalent in millimeter, divide by 7. E.g. 7 cm snow = 10 mm precipitation water equivalent\nTotal cloud cover as an area fraction\nLow level clouds and fog up to 2 km altitude\nMid level clouds from 2 to 6 km altitude\nHigh level clouds from 6 km altitude\nShortwave solar radiation as average of the preceding hour. This is equal to the total global horizontal irradiation\nDirect solar radiation as average of the preceding hour on the horizontal plane and the normal plane (perpendicular to the sun)\nDirect solar radiation as average of the preceding hour on the horizontal plane and the normal plane (perpendicular to the sun)\nDiffuse solar radiation as average of the preceding hour\nWind speed at 10 or 100 meters above ground. Wind speed on 10 meters is the standard level.\nWind speed at 10 or 100 meters above ground. Wind speed on 10 meters is the standard level.\nWind direction at 10 or 100 meters above ground\nWind direction at 10 or 100 meters above ground\nGusts at 10 meters above ground of the indicated hour. Wind gusts in CERRA are defined as the maximum wind gusts of the preceding hour. Please consult the ECMWF IFS documentation for more information on how wind gusts are parameterized in weather models.\nET0 Reference Evapotranspiration of a well watered grass field. Based on FAO-56 Penman-Monteith equations ET0 is calculated from temperature, wind speed, humidity and solar radiation. Unlimited soil water is assumed. ET0 is commonly used to estimate the required irrigation for plants.\nWeather condition as a numeric code. Follow WMO weather interpretation codes. See table below for details. Weather code is calculated from cloud cover analysis, precipitation and snowfall. As barely no information about atmospheric stability is available, estimation about thunderstorms is not possible.\nVapor Pressure Deificit (VPD) in kilopascal (kPa). For high VPD (>1.6), water transpiration of plants increases. For low VPD (<0.4), transpiration decreases\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths."
dailyDescription <- "The most severe weather condition on a given day\nMaximum and minimum daily air temperature at 2 meters above ground\nMaximum and minimum daily air temperature at 2 meters above ground\nMaximum and minimum daily apparent temperature\nMaximum and minimum daily apparent temperature\nSum of daily precipitation (including rain, showers and snowfall)\nSum of daily rain\nSum of daily snowfall\nThe number of hours with rain\nSun rise and set times\nSun rise and set times\nMaximum wind speed and gusts on a day\nMaximum wind speed and gusts on a day\nDominant wind direction\nThe sum of solar radiaion on a given day in Megajoules\nDaily sum of ET0 Reference Evapotranspiration of a well watered grass field"

# Create tibble for hourly metrics
tblMetricsHourly <- tibble::tibble(metric=hourlyMetrics %>% str_split_1(","), 
                                   description=hourlyDescription %>% str_split_1("\n")
                                   )
tblMetricsHourly %>% 
    print(n=50)
## # A tibble: 33 × 2
##    metric                        description                                    
##    <chr>                         <chr>                                          
##  1 temperature_2m                Air temperature at 2 meters above ground       
##  2 relativehumidity_2m           Relative humidity at 2 meters above ground     
##  3 dewpoint_2m                   Dew point temperature at 2 meters above ground 
##  4 apparent_temperature          Apparent temperature is the perceived feels-li…
##  5 pressure_msl                  Atmospheric air pressure reduced to mean sea l…
##  6 surface_pressure              Atmospheric air pressure reduced to mean sea l…
##  7 precipitation                 Total precipitation (rain, showers, snow) sum …
##  8 rain                          Only liquid precipitation of the preceding hou…
##  9 snowfall                      Snowfall amount of the preceding hour in centi…
## 10 cloudcover                    Total cloud cover as an area fraction          
## 11 cloudcover_low                Low level clouds and fog up to 2 km altitude   
## 12 cloudcover_mid                Mid level clouds from 2 to 6 km altitude       
## 13 cloudcover_high               High level clouds from 6 km altitude           
## 14 shortwave_radiation           Shortwave solar radiation as average of the pr…
## 15 direct_radiation              Direct solar radiation as average of the prece…
## 16 direct_normal_irradiance      Direct solar radiation as average of the prece…
## 17 diffuse_radiation             Diffuse solar radiation as average of the prec…
## 18 windspeed_10m                 Wind speed at 10 or 100 meters above ground. W…
## 19 windspeed_100m                Wind speed at 10 or 100 meters above ground. W…
## 20 winddirection_10m             Wind direction at 10 or 100 meters above ground
## 21 winddirection_100m            Wind direction at 10 or 100 meters above ground
## 22 windgusts_10m                 Gusts at 10 meters above ground of the indicat…
## 23 et0_fao_evapotranspiration    ET0 Reference Evapotranspiration of a well wat…
## 24 weathercode                   Weather condition as a numeric code. Follow WM…
## 25 vapor_pressure_deficit        Vapor Pressure Deificit (VPD) in kilopascal (k…
## 26 soil_temperature_0_to_7cm     Average temperature of different soil levels b…
## 27 soil_temperature_7_to_28cm    Average temperature of different soil levels b…
## 28 soil_temperature_28_to_100cm  Average temperature of different soil levels b…
## 29 soil_temperature_100_to_255cm Average temperature of different soil levels b…
## 30 soil_moisture_0_to_7cm        Average soil water content as volumetric mixin…
## 31 soil_moisture_7_to_28cm       Average soil water content as volumetric mixin…
## 32 soil_moisture_28_to_100cm     Average soil water content as volumetric mixin…
## 33 soil_moisture_100_to_255cm    Average soil water content as volumetric mixin…
# Create tibble for daily metrics
tblMetricsDaily <- tibble::tibble(metric=dailyMetrics %>% str_split_1(","), 
                                  description=dailyDescription %>% str_split_1("\n")
                                   )
tblMetricsDaily
## # A tibble: 16 × 2
##    metric                     description                                       
##    <chr>                      <chr>                                             
##  1 weathercode                The most severe weather condition on a given day  
##  2 temperature_2m_max         Maximum and minimum daily air temperature at 2 me…
##  3 temperature_2m_min         Maximum and minimum daily air temperature at 2 me…
##  4 apparent_temperature_max   Maximum and minimum daily apparent temperature    
##  5 apparent_temperature_min   Maximum and minimum daily apparent temperature    
##  6 precipitation_sum          Sum of daily precipitation (including rain, showe…
##  7 rain_sum                   Sum of daily rain                                 
##  8 snowfall_sum               Sum of daily snowfall                             
##  9 precipitation_hours        The number of hours with rain                     
## 10 sunrise                    Sun rise and set times                            
## 11 sunset                     Sun rise and set times                            
## 12 windspeed_10m_max          Maximum wind speed and gusts on a day             
## 13 windgusts_10m_max          Maximum wind speed and gusts on a day             
## 14 winddirection_10m_dominant Dominant wind direction                           
## 15 shortwave_radiation_sum    The sum of solar radiaion on a given day in Megaj…
## 16 et0_fao_evapotranspiration Daily sum of ET0 Reference Evapotranspiration of …

Daily and hourly data are downloaded for NYc, cached to avoid multiple hits to the server:

# Hourly data download for New York, NY
testURLHourly <- helperOpenMeteoURL(cityName="New York NY", 
                                    hourlyIndices=1:nrow(tblMetricsHourly),
                                    startDate="2010-01-01", 
                                    endDate="2023-06-15", 
                                    tz="America/New_York"
                                    )
## 
## Hourly metrics created from indices: temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm
testURLHourly
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=40.67&longitude=-73.94&start_date=2010-01-01&end_date=2023-06-15&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm&timezone=America%2FNew_York"
# Download file
if(!file.exists("testOM_hourly_nyc.json")) {
    fileDownload(fileName="testOM_hourly_nyc.json", url=testURLHourly)
} else {
    cat("\nFile testOM_hourly_nyc.json already exists, skipping download\n")
}
## 
## File testOM_hourly_nyc.json already exists, skipping download
# Daily data download for New York, NY
testURLDaily <- helperOpenMeteoURL(cityName="New York NY", 
                                   dailyIndices=1:nrow(tblMetricsDaily),
                                   startDate="2010-01-01", 
                                   endDate="2023-06-15", 
                                   tz="America/New_York"
                                   )
## 
## Daily metrics created from indices: weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
testURLDaily
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=40.67&longitude=-73.94&start_date=2010-01-01&end_date=2023-06-15&daily=weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration&timezone=America%2FNew_York"
# Download file
if(!file.exists("testOM_daily_nyc.json")) {
    fileDownload(fileName="testOM_daily_nyc.json", url=testURLDaily)
} else {
    cat("\nFile testOM_daily_nyc.json already exists, skipping download\n")
}
## 
## File testOM_daily_nyc.json already exists, skipping download

Data are read (process cached):

# Read daily JSON file
nycOMDaily <- readOpenMeteoJSON("testOM_daily_nyc.json")
## 
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, daily_units, daily
nycOMDaily
## $tblDaily
## # A tibble: 4,914 × 18
##    date       time       weathercode temperature_2m_max temperature_2m_min
##    <date>     <chr>            <int>              <dbl>              <dbl>
##  1 2010-01-01 2010-01-01          73                5                 -1.4
##  2 2010-01-02 2010-01-02          71               -0.6               -9.2
##  3 2010-01-03 2010-01-03          71               -4.8              -10  
##  4 2010-01-04 2010-01-04           1               -0.8               -7.3
##  5 2010-01-05 2010-01-05           1               -0.2               -7.3
##  6 2010-01-06 2010-01-06           2                1.1               -5.3
##  7 2010-01-07 2010-01-07           2                3.6               -3.7
##  8 2010-01-08 2010-01-08          71                1.9               -5.7
##  9 2010-01-09 2010-01-09           0               -1.4               -7.7
## 10 2010-01-10 2010-01-10           0               -1.7              -10.3
## # ℹ 4,904 more rows
## # ℹ 13 more variables: apparent_temperature_max <dbl>,
## #   apparent_temperature_min <dbl>, precipitation_sum <dbl>, rain_sum <dbl>,
## #   snowfall_sum <dbl>, precipitation_hours <dbl>, sunrise <chr>, sunset <chr>,
## #   windspeed_10m_max <dbl>, windgusts_10m_max <dbl>,
## #   winddirection_10m_dominant <int>, shortwave_radiation_sum <dbl>,
## #   et0_fao_evapotranspiration <dbl>
## 
## $tblHourly
## NULL
## 
## $tblUnits
## # A tibble: 17 × 4
##    metricType  name                       value      description                
##    <chr>       <chr>                      <chr>      <chr>                      
##  1 daily_units time                       "iso8601"  <NA>                       
##  2 daily_units weathercode                "wmo code" The most severe weather co…
##  3 daily_units temperature_2m_max         "deg C"    Maximum and minimum daily …
##  4 daily_units temperature_2m_min         "deg C"    Maximum and minimum daily …
##  5 daily_units apparent_temperature_max   "deg C"    Maximum and minimum daily …
##  6 daily_units apparent_temperature_min   "deg C"    Maximum and minimum daily …
##  7 daily_units precipitation_sum          "mm"       Sum of daily precipitation…
##  8 daily_units rain_sum                   "mm"       Sum of daily rain          
##  9 daily_units snowfall_sum               "cm"       Sum of daily snowfall      
## 10 daily_units precipitation_hours        "h"        The number of hours with r…
## 11 daily_units sunrise                    "iso8601"  Sun rise and set times     
## 12 daily_units sunset                     "iso8601"  Sun rise and set times     
## 13 daily_units windspeed_10m_max          "km/h"     Maximum wind speed and gus…
## 14 daily_units windgusts_10m_max          "km/h"     Maximum wind speed and gus…
## 15 daily_units winddirection_10m_dominant "deg "     Dominant wind direction    
## 16 daily_units shortwave_radiation_sum    "MJ/m²"    The sum of solar radiaion …
## 17 daily_units et0_fao_evapotranspiration "mm"       Daily sum of ET0 Reference…
## 
## $tblDescription
## # A tibble: 1 × 7
##   latitude longitude generationtime_ms utc_offset_seconds timezone        
##      <dbl>     <dbl>             <dbl>              <int> <chr>           
## 1     40.7     -73.9              101.             -14400 America/New_York
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
prettyOpenMeteoMeta(nycOMDaily)
## 
## latitude: 40.7
## longitude: -73.9
## generationtime_ms: 100.914
## utc_offset_seconds: -14400
## timezone: America/New_York
## timezone_abbreviation: EDT
## elevation: 36
# Read hourly JSON file
nycOMHourly <- readOpenMeteoJSON("testOM_hourly_nyc.json")
## 
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly
nycOMHourly
## $tblDaily
## NULL
## 
## $tblHourly
## # A tibble: 117,936 × 37
##    time                date        hour temperature_2m relativehumidity_2m
##    <dttm>              <date>     <int>          <dbl>               <int>
##  1 2010-01-01 00:00:00 2010-01-01     0           -1.1                  95
##  2 2010-01-01 01:00:00 2010-01-01     1           -1                    96
##  3 2010-01-01 02:00:00 2010-01-01     2           -1                    96
##  4 2010-01-01 03:00:00 2010-01-01     3           -0.8                  97
##  5 2010-01-01 04:00:00 2010-01-01     4           -0.9                  97
##  6 2010-01-01 05:00:00 2010-01-01     5           -0.8                  97
##  7 2010-01-01 06:00:00 2010-01-01     6           -0.7                  97
##  8 2010-01-01 07:00:00 2010-01-01     7           -0.5                  97
##  9 2010-01-01 08:00:00 2010-01-01     8           -0.6                  97
## 10 2010-01-01 09:00:00 2010-01-01     9           -0.6                  97
## # ℹ 117,926 more rows
## # ℹ 32 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## #   pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## #   rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## #   cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## #   direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## #   diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
## 
## $tblUnits
## # A tibble: 34 × 4
##    metricType   name                 value   description                        
##    <chr>        <chr>                <chr>   <chr>                              
##  1 hourly_units time                 iso8601 <NA>                               
##  2 hourly_units temperature_2m       deg C   Air temperature at 2 meters above …
##  3 hourly_units relativehumidity_2m  %       Relative humidity at 2 meters abov…
##  4 hourly_units dewpoint_2m          deg C   Dew point temperature at 2 meters …
##  5 hourly_units apparent_temperature deg C   Apparent temperature is the percei…
##  6 hourly_units pressure_msl         hPa     Atmospheric air pressure reduced t…
##  7 hourly_units surface_pressure     hPa     Atmospheric air pressure reduced t…
##  8 hourly_units precipitation        mm      Total precipitation (rain, showers…
##  9 hourly_units rain                 mm      Only liquid precipitation of the p…
## 10 hourly_units snowfall             cm      Snowfall amount of the preceding h…
## # ℹ 24 more rows
## 
## $tblDescription
## # A tibble: 1 × 7
##   latitude longitude generationtime_ms utc_offset_seconds timezone        
##      <dbl>     <dbl>             <dbl>              <int> <chr>           
## 1     40.7     -73.9              118.             -14400 America/New_York
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
prettyOpenMeteoMeta(nycOMHourly)
## 
## latitude: 40.7
## longitude: -73.9
## generationtime_ms: 118.0021
## utc_offset_seconds: -14400
## timezone: America/New_York
## timezone_abbreviation: EDT
## elevation: 36

Percentiles and other explanatory variables are created:

# Create percentiles for numeric variables
nycTemp <- nycOMHourly$tblHourly %>%
    mutate(month=factor(month.abb[lubridate::month(date)], levels=month.abb), 
           hour=lubridate::hour(time), 
           fct_hour=factor(hour), 
           across(where(is.numeric), .fns=function(x) round(100*percent_rank(x)), .names="pct_{.col}")
           )
nycTemp
## # A tibble: 117,936 × 73
##    time                date        hour temperature_2m relativehumidity_2m
##    <dttm>              <date>     <int>          <dbl>               <int>
##  1 2010-01-01 00:00:00 2010-01-01     0           -1.1                  95
##  2 2010-01-01 01:00:00 2010-01-01     1           -1                    96
##  3 2010-01-01 02:00:00 2010-01-01     2           -1                    96
##  4 2010-01-01 03:00:00 2010-01-01     3           -0.8                  97
##  5 2010-01-01 04:00:00 2010-01-01     4           -0.9                  97
##  6 2010-01-01 05:00:00 2010-01-01     5           -0.8                  97
##  7 2010-01-01 06:00:00 2010-01-01     6           -0.7                  97
##  8 2010-01-01 07:00:00 2010-01-01     7           -0.5                  97
##  9 2010-01-01 08:00:00 2010-01-01     8           -0.6                  97
## 10 2010-01-01 09:00:00 2010-01-01     9           -0.6                  97
## # ℹ 117,926 more rows
## # ℹ 68 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## #   pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## #   rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## #   cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## #   direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## #   diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
# Create summary statistics
nycStats <- nycTemp %>%
    select((where(is.numeric))) %>%
    pivot_longer(cols=everything()) %>%
    group_by(name) %>%
    summarize(across(value, .fns=list(min=min, median=median, mean=mean, max=max)))
nycStats %>%
    filter(str_detect(name, "pct")) %>%
    arrange(value_median, value_mean) %>%
    print(n=40)
## # A tibble: 34 × 5
##    name                              value_min value_median value_mean value_max
##    <chr>                                 <dbl>        <dbl>      <dbl>     <dbl>
##  1 pct_snowfall                              0            0       1.94       100
##  2 pct_rain                                  0            0      11.8        100
##  3 pct_precipitation                         0            0      12.7        100
##  4 pct_direct_radiation                      0            0      36.2        100
##  5 pct_direct_normal_irradiance              0            0      36.3        100
##  6 pct_weathercode                           0           36      38.6        100
##  7 pct_cloudcover_low                        0           47      38.4         91
##  8 pct_et0_fao_evapotranspiration            0           48      46.0        100
##  9 pct_hour                                  0           48      48           96
## 10 pct_relativehumidity_2m                   0           48      49.2        100
## 11 pct_windspeed_10m                         0           49      49.7        100
## 12 pct_pressure_msl                          0           49      49.8        100
## 13 pct_diffuse_radiation                     0           50      39.1        100
## 14 pct_shortwave_radiation                   0           50      39.1        100
## 15 pct_cloudcover_high                       0           50      41.1         88
## 16 pct_cloudcover_mid                        0           50      42.9         94
## 17 pct_cloudcover                            0           50      46.5         81
## 18 pct_soil_moisture_100_to_255cm            0           50      49.3        100
## 19 pct_windgusts_10m                         0           50      49.5        100
## 20 pct_vapor_pressure_deficit                0           50      49.5        100
## 21 pct_soil_moisture_28_to_100cm             0           50      49.7        100
## 22 pct_soil_temperature_100_to_255cm         0           50      49.7        100
## 23 pct_soil_moisture_7_to_28cm               0           50      49.7        100
## 24 pct_soil_temperature_28_to_100cm          0           50      49.8        100
## 25 pct_soil_moisture_0_to_7cm                0           50      49.8        100
## 26 pct_soil_temperature_7_to_28cm            0           50      49.8        100
## 27 pct_surface_pressure                      0           50      49.8        100
## 28 pct_winddirection_10m                     0           50      49.8         99
## 29 pct_winddirection_100m                    0           50      49.8        100
## 30 pct_soil_temperature_0_to_7cm             0           50      49.8        100
## 31 pct_windspeed_100m                        0           50      49.8        100
## 32 pct_dewpoint_2m                           0           50      49.9        100
## 33 pct_temperature_2m                        0           50      49.9        100
## 34 pct_apparent_temperature                  0           50      49.9        100
nycStats %>%
    filter(!str_detect(name, "pct")) %>%
    arrange(value_max, value_mean) %>%
    print(n=40)
## # A tibble: 34 × 5
##    name                          value_min value_median value_mean value_max
##    <chr>                             <dbl>        <dbl>      <dbl>     <dbl>
##  1 soil_moisture_100_to_255cm        0.332        0.377    0.375       0.43 
##  2 soil_moisture_28_to_100cm         0.222        0.346    0.338       0.438
##  3 soil_moisture_0_to_7cm            0.085        0.343    0.311       0.439
##  4 soil_moisture_7_to_28cm           0.17         0.352    0.332       0.439
##  5 et0_fao_evapotranspiration        0            0.04     0.108       0.88 
##  6 snowfall                          0            0        0.00795     4.2  
##  7 vapor_pressure_deficit            0            0.31     0.504       5.16 
##  8 rain                              0            0        0.116      19.2  
##  9 precipitation                     0            0        0.127      19.2  
## 10 soil_temperature_100_to_255cm     2.8         12.5     12.8        22.1  
## 11 hour                              0           11.5     11.5        23    
## 12 dewpoint_2m                     -28.9          7.4      6.56       26    
## 13 soil_temperature_28_to_100cm      0.3         12.4     12.8        26    
## 14 soil_temperature_7_to_28cm       -4           12.8     13.0        32.7  
## 15 temperature_2m                  -21.7         12.3     12.1        38.5  
## 16 apparent_temperature            -27           10.1     10.4        43.5  
## 17 soil_temperature_0_to_7cm        -9           12.8     13.1        47.8  
## 18 windspeed_10m                     0           10.4     11.4        60.6  
## 19 weathercode                       0            1        8.92       75    
## 20 windspeed_100m                    0           18.9     19.6        95.5  
## 21 cloudcover_low                    0            1       24.7       100    
## 22 cloudcover_mid                    0            9       29.6       100    
## 23 cloudcover_high                   0           15       38.8       100    
## 24 cloudcover                        0           33       44.2       100    
## 25 relativehumidity_2m              15           73       71.2       100    
## 26 windgusts_10m                     2.2         22       24.0       122.   
## 27 winddirection_10m                 1          227      208.        360    
## 28 winddirection_100m                1          228      209.        360    
## 29 diffuse_radiation                 0            5       55.9       453    
## 30 direct_radiation                  0            0      111.        912    
## 31 direct_normal_irradiance          0            0      199.        999.   
## 32 shortwave_radiation               0            7      167.       1006    
## 33 surface_pressure                962.        1012.    1012.       1040.   
## 34 pressure_msl                    966.        1017.    1017.       1044.
# Outlier wind data (includes Sandy 2012-10-29, as well as several other dates)
nycTemp %>%
    filter(windspeed_10m>60 | windgusts_10m>100) %>%
    select(time, windspeed_10m, windgusts_10m, everything())
## # A tibble: 18 × 73
##    time                windspeed_10m windgusts_10m date        hour
##    <dttm>                      <dbl>         <dbl> <date>     <int>
##  1 2010-01-25 13:00:00          38            106. 2010-01-25    13
##  2 2010-01-25 14:00:00          35.6          107. 2010-01-25    14
##  3 2012-10-29 13:00:00          48.5          104. 2012-10-29    13
##  4 2012-10-29 16:00:00          59.1          122. 2012-10-29    16
##  5 2012-10-29 17:00:00          60.6          114. 2012-10-29    17
##  6 2012-10-29 18:00:00          54.7          114. 2012-10-29    18
##  7 2012-10-29 19:00:00          56.7          107. 2012-10-29    19
##  8 2012-10-29 20:00:00          59.1          108  2012-10-29    20
##  9 2012-10-29 21:00:00          57.1          109. 2012-10-29    21
## 10 2012-10-29 22:00:00          53.2          106. 2012-10-29    22
## 11 2012-10-29 23:00:00          49.8          103. 2012-10-29    23
## 12 2012-12-21 09:00:00          39.4          106. 2012-12-21     9
## 13 2012-12-21 10:00:00          26.8          108. 2012-12-21    10
## 14 2013-01-31 04:00:00          40            104. 2013-01-31     4
## 15 2013-01-31 06:00:00          30.9          101. 2013-01-31     6
## 16 2019-11-01 01:00:00          34.3          102. 2019-11-01     1
## 17 2020-04-13 14:00:00          36.4          100. 2020-04-13    14
## 18 2020-08-04 14:00:00          47.2          110. 2020-08-04    14
## # ℹ 68 more variables: temperature_2m <dbl>, relativehumidity_2m <int>,
## #   dewpoint_2m <dbl>, apparent_temperature <dbl>, pressure_msl <dbl>,
## #   surface_pressure <dbl>, precipitation <dbl>, rain <dbl>, snowfall <dbl>,
## #   cloudcover <int>, cloudcover_low <int>, cloudcover_mid <int>,
## #   cloudcover_high <int>, shortwave_radiation <dbl>, direct_radiation <dbl>,
## #   direct_normal_irradiance <dbl>, diffuse_radiation <dbl>,
## #   windspeed_100m <dbl>, winddirection_10m <int>, winddirection_100m <int>, …
keyDates <- nycTemp %>%
    filter(windspeed_10m>60 | windgusts_10m>100) %>%
    pull(date) %>%
    unique()
nycTemp %>%
    ggplot(aes(x=time)) +
    geom_line(aes(y=windgusts_10m)) + 
    geom_vline(xintercept = lubridate::ymd_hm(paste0(keyDates, "T12:00")), color="red", lty=2)

Random variables and additional explanatory variables are created, and test-train data are split:

# Add random variables to dataset, then split in to test and train
set.seed(23100415)
nycTempRand <- nycTemp %>%
    mutate(pct_0005=sample(0:5, size=nrow(.), replace=TRUE),
           pct_0025=sample(0:25, size=nrow(.), replace=TRUE), 
           pct_0100=sample(0:100, size=nrow(.), replace=TRUE), 
           pct_0250=sample(0:250, size=nrow(.), replace=TRUE),
           pct_0500=sample(0:500, size=nrow(.), replace=TRUE), 
           pct_1000=sample(0:1000, size=nrow(.), replace=TRUE), 
           pct_2500=sample(0:2500, size=nrow(.), replace=TRUE), 
           pct_5000=sample(0:5000, size=nrow(.), replace=TRUE), 
           tod=ifelse(hour>=7 & hour<=18, "Day", "Night"), 
           season=case_when(month %in% c("Mar", "Apr", "May") ~ "Spring", 
                            month %in% c("Jun", "Jul", "Aug") ~ "Summer", 
                            month %in% c("Sep", "Oct", "Nov") ~ "Fall", 
                            month %in% c("Dec", "Jan", "Feb") ~ "Winter", 
                            TRUE~"typo"
                            ), 
           todSeason=paste0(season, "-", tod), 
           tod=factor(tod, levels=c("Day", "Night")), 
           season=factor(season, levels=c("Spring", "Summer", "Fall", "Winter")), 
           todSeason=factor(todSeason, 
                            levels=paste0(rep(c("Spring", "Summer", "Fall", "Winter"), each=2), 
                                          "-", 
                                          c("Day", "Night")
                                          )
                            )
           )
nycTempRand %>% count(tod)
## # A tibble: 2 × 2
##   tod       n
##   <fct> <int>
## 1 Day   58968
## 2 Night 58968
nycTempRand %>% count(season)
## # A tibble: 4 × 2
##   season     n
##   <fct>  <int>
## 1 Spring 30912
## 2 Summer 29064
## 3 Fall   28392
## 4 Winter 29568
nycTempRand %>% count(todSeason)
## # A tibble: 8 × 2
##   todSeason        n
##   <fct>        <int>
## 1 Spring-Day   15456
## 2 Spring-Night 15456
## 3 Summer-Day   14532
## 4 Summer-Night 14532
## 5 Fall-Day     14196
## 6 Fall-Night   14196
## 7 Winter-Day   14784
## 8 Winter-Night 14784
# Split in to test and train data (3:1 split in favor of test)
idxTrain <- sort(sample(1:nrow(nycTempRand), size=round(0.75*nrow(nycTempRand)), replace=FALSE))
nycTempTrain <- nycTempRand[idxTrain, ]
nycTempTest <- nycTempRand[-idxTrain, ]

K-means is run using 1-15 centers, cached to reduce processing time:

# Create set of relevant training variables
varsTrain <- nycTempTrain %>%
    select(starts_with("pct")) %>%
    select(-pct_hour, -pct_weathercode, -ends_with("0"), -ends_with("5")) %>%
    names()
varsTrain
##  [1] "pct_temperature_2m"                "pct_relativehumidity_2m"          
##  [3] "pct_dewpoint_2m"                   "pct_apparent_temperature"         
##  [5] "pct_pressure_msl"                  "pct_surface_pressure"             
##  [7] "pct_precipitation"                 "pct_rain"                         
##  [9] "pct_snowfall"                      "pct_cloudcover"                   
## [11] "pct_cloudcover_low"                "pct_cloudcover_mid"               
## [13] "pct_cloudcover_high"               "pct_shortwave_radiation"          
## [15] "pct_direct_radiation"              "pct_direct_normal_irradiance"     
## [17] "pct_diffuse_radiation"             "pct_windspeed_10m"                
## [19] "pct_windspeed_100m"                "pct_winddirection_10m"            
## [21] "pct_winddirection_100m"            "pct_windgusts_10m"                
## [23] "pct_et0_fao_evapotranspiration"    "pct_vapor_pressure_deficit"       
## [25] "pct_soil_temperature_0_to_7cm"     "pct_soil_temperature_7_to_28cm"   
## [27] "pct_soil_temperature_28_to_100cm"  "pct_soil_temperature_100_to_255cm"
## [29] "pct_soil_moisture_0_to_7cm"        "pct_soil_moisture_7_to_28cm"      
## [31] "pct_soil_moisture_28_to_100cm"     "pct_soil_moisture_100_to_255cm"
# Create k-means for 1-15 clusters (need to fix function for returning km object defaults)
t <- proc.time()
nycKMList <- lapply(1:15, FUN=function(x) runKMeans(nycTempTrain, 
                                                    vars=varsTrain, 
                                                    centers=x, 
                                                    nStart=25, 
                                                    seed=23100515, 
                                                    iter.max=50L,
                                                    plotMeans=FALSE, 
                                                    plotPct=NULL, 
                                                    returnKM=TRUE
                                                    )
                 )
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4422600)
proc.time()-t
##    user  system elapsed 
##  243.17    2.06  246.28

Change in SS-between is explored based on number of clusters:

dfSS <- sapply(nycKMList, FUN=function(x) c("nCluster"=length(x$size), 
                                            "totss"=x$totss, 
                                            "betweenss"=x$betweenss, 
                                            "tot.withinss"=x$tot.withinss, 
                                            "iter"=x$iter, 
                                            "ifault"=unclass(ifelse(is.null(x$ifault), 0, x$ifault))
                                            )
               ) %>% 
    t() %>% 
    tibble::as_tibble() %>% 
    mutate(pct=tot.withinss/totss, dpct=pct-lag(pct)) 
dfSS
## # A tibble: 15 × 8
##    nCluster       totss betweenss tot.withinss  iter ifault   pct     dpct
##       <dbl>       <dbl>     <dbl>        <dbl> <dbl>  <dbl> <dbl>    <dbl>
##  1        1 2710013341.  -2.79e-3  2710013341.     1      0 1.00  NA      
##  2        2 2710013341.   5.98e+8  2112184370.     1      0 0.779 -0.221  
##  3        3 2710013341.   8.48e+8  1862002274.     3      0 0.687 -0.0923 
##  4        4 2710013341.   1.06e+9  1655004466.     4      0 0.611 -0.0764 
##  5        5 2710013341.   1.24e+9  1473038346.     4      0 0.544 -0.0671 
##  6        6 2710013341.   1.30e+9  1406093293.     5      0 0.519 -0.0247 
##  7        7 2710013341.   1.35e+9  1355717649.     6      0 0.500 -0.0186 
##  8        8 2710013341.   1.40e+9  1308067061.     6      0 0.483 -0.0176 
##  9        9 2710013341.   1.44e+9  1269217987.     8      0 0.468 -0.0143 
## 10       10 2710013341.   1.48e+9  1233929474.    12      0 0.455 -0.0130 
## 11       11 2710013341.   1.51e+9  1201664148.     7      0 0.443 -0.0119 
## 12       12 2710013341.   1.54e+9  1171837975.     9      0 0.432 -0.0110 
## 13       13 2710013341.   1.57e+9  1144415346.     9      0 0.422 -0.0101 
## 14       14 2710013341.   1.59e+9  1121265060.     8      0 0.414 -0.00854
## 15       15 2710013341.   1.61e+9  1102405182.     8      0 0.407 -0.00696
dfSS %>% 
    ggplot(aes(x=nCluster, y=pmin(1, pct))) + 
    geom_point() + 
    geom_line() + 
    lims(y=c(0, 1)) + 
    labs(x="# Clusters", 
         y="Within SS / Total SS", 
         title="Sum-squares ratio by number of clusters (k=means)"
         ) + 
    geom_text(aes(y=pct-0.05, label=round(pct, 3)), size=2.5)

dfSS %>% 
    filter(!is.na(dpct)) %>% 
    ggplot(aes(x=factor(nCluster), y=dpct)) + 
    geom_col(fill="lightblue") + 
    labs(x="When adding this cluster", 
         y="Change in (Within SS / Total SS)", 
         title="Sum-squares ratio by number of clusters (k=means)"
         ) + 
    geom_text(aes(y=dpct/2, label=round(dpct, 3)), size=2.5)

This is suggestive that exploring evolution of data splits at k = 1, 2, 3, 4, 5 may be informative

The assignKMeans() functionality is tested:

# Example of returning distance data
glimpse(assignKMeans(km=nycKMList[[10]], df=nycTempTrain, returnAllDistanceData=TRUE))
## Rows: 88,452
## Columns: 11
## $ V1  <dbl> 291.1435, 286.5308, 245.5082, 250.7208, 246.9744, 243.3741, 246.27…
## $ V2  <dbl> 323.4492, 326.5322, 289.4243, 286.3461, 279.3103, 278.8232, 253.60…
## $ V3  <dbl> 188.7837, 197.5598, 226.4654, 221.9406, 211.6541, 211.3554, 210.01…
## $ V4  <dbl> 275.3905, 275.5157, 234.9525, 237.1705, 231.7343, 229.0003, 199.43…
## $ V5  <dbl> 261.0590, 263.9333, 222.6504, 219.1689, 210.4299, 209.5118, 217.62…
## $ V6  <dbl> 358.7661, 354.8273, 322.1453, 325.5176, 322.7495, 320.4959, 294.47…
## $ V7  <dbl> 283.9546, 288.0497, 305.6469, 302.8877, 296.2321, 295.9944, 274.38…
## $ V8  <dbl> 307.6029, 299.4418, 256.8930, 263.4953, 261.7918, 257.9137, 226.34…
## $ V9  <dbl> 199.8582, 202.7813, 145.6591, 144.7558, 133.0292, 130.2691, 139.80…
## $ V10 <dbl> 248.5199, 239.4626, 186.2984, 196.5638, 194.4515, 188.2631, 192.11…
## $ cl  <int> 3, 3, 9, 9, 9, 9, 9, 4, 4, 4, 4, 4, 4, 4, 9, 9, 9, 9, 9, 3, 3, 9, …
# Confirmation that cluster assignments match (could occasionally have a tied distance and possible mismatch)
table(assignKMeans(km=nycKMList[[10]], df=nycTempTrain), nycKMList[[10]]$cluster)
##     
##          1     2     3     4     5     6     7     8     9    10
##   1   9397     0     0     0     0     0     0     0     0     0
##   2      0  9719     0     0     0     0     0     0     0     0
##   3      0     0  7010     0     0     0     0     0     0     0
##   4      0     0     0  8805     0     0     0     0     0     0
##   5      0     0     0     0  8498     0     0     0     0     0
##   6      0     0     0     0     0 11024     0     0     0     0
##   7      0     0     0     0     0     0  3734     0     0     0
##   8      0     0     0     0     0     0     0  8534     0     0
##   9      0     0     0     0     0     0     0     0 10298     0
##   10     0     0     0     0     0     0     0     0     0 11433

The function successfully assigns points to the nearest centroid, consistent with the output of kmeans()

The output of 4 k-means clusters is explored:

# Function run on the 4-cluster k-means object
runKMeans(df=nycTempTrain, 
          km=nycKMList[[4]], 
          plotMeans=TRUE, 
          plotPct=list(c("todSeason"), c("hour", "month")), 
          nrowPct=1, 
          nrowMeans=1
          )

Clusters include the four quadrants of day/night and warm/cold season

The output of 5 k-means clusters is explored:

# Function run on the 5-cluster k-means object
runKMeans(df=nycTempTrain, 
          km=nycKMList[[5]], 
          plotMeans=TRUE, 
          plotPct=list(c("todSeason"), c("hour", "month")), 
          nrowPct=1, 
          nrowMeans=1
          )

# Exploring precipitation by cluster
nycTempTrain %>% 
    mutate(cl=nycKMList[[5]]$cluster) %>% 
    group_by(cl) %>% 
    summarize(n=n(), 
              precip=sum(precipitation), 
              rain=sum(rain), 
              snow=sum(snowfall), 
              pct_hrs_precip=mean(precipitation>0)
              ) %>%
    arrange(desc(precip))
## # A tibble: 5 × 6
##      cl     n precip   rain  snow pct_hrs_precip
##   <int> <int>  <dbl>  <dbl> <dbl>          <dbl>
## 1     3 10065 9901.  9066.  584.          0.965 
## 2     2 21766 1018.  1018.    0           0.0711
## 3     1 17965  120.   120.    0           0.0150
## 4     5 20844  103.     1.6  81.8         0.0151
## 5     4 17812   48.5   16.8  28.8         0.0133

Clusters include precipitation/no, followed by the four quadrants of day/night and warm/cold season

Principal component analysis is run to explore variance explained by number of components:

# Correlation analysis
corTrain <- cor(nycTempTrain[, varsTrain])
hcTrain <- hclust(as.dist((1-corTrain)/2))
orderTrain <- hcTrain$order %>% purrr::set_names(hcTrain$labels)
tmpHeat <- as.data.frame(corTrain, row.names=rownames(corTrain)) %>% 
    rownames_to_column("var1") %>% 
    tibble::as_tibble() %>% 
    pivot_longer(cols=-c("var1"), names_to="var2") %>%
    mutate()
tmpHeat %>%
    ggplot(aes(x=fct_reorder(var1, orderTrain[var1]), y=fct_reorder(var2, orderTrain[var2]))) + 
    geom_tile(aes(fill=value)) + 
    geom_text(aes(label=round(value, 2)), size=2) + 
    scale_fill_gradient2(low="red", mid="white", high="green") + 
    labs(x=NULL, y=NULL, title="Correlations") + 
    theme(axis.text.x=element_text(angle=90, vjust=0.5, hjust=1))

pcaTrain <- prcomp(nycTempTrain[, varsTrain])
summary(pcaTrain)
## Importance of components:
##                            PC1     PC2     PC3      PC4      PC5      PC6
## Standard deviation     92.9713 76.8021 66.0539 51.40618 40.00159 36.42079
## Proportion of Variance  0.2821  0.1925  0.1424  0.08625  0.05223  0.04329
## Cumulative Proportion   0.2821  0.4746  0.6170  0.70329  0.75552  0.79881
##                             PC7      PC8      PC9     PC10     PC11     PC12
## Standard deviation     33.96092 32.67399 29.09379 27.76099 24.21620 20.54141
## Proportion of Variance  0.03764  0.03484  0.02763  0.02515  0.01914  0.01377
## Cumulative Proportion   0.83646  0.87130  0.89893  0.92408  0.94322  0.95699
##                            PC13    PC14     PC15     PC16     PC17   PC18
## Standard deviation     14.33743 14.1073 13.59159 12.58364 10.51321 9.4187
## Proportion of Variance  0.00671  0.0065  0.00603  0.00517  0.00361 0.0029
## Cumulative Proportion   0.96370  0.9702  0.97623  0.98140  0.98500 0.9879
##                           PC19    PC20    PC21    PC22    PC23    PC24    PC25
## Standard deviation     8.59093 8.19193 7.54046 6.67977 5.94066 5.28136 4.33450
## Proportion of Variance 0.00241 0.00219 0.00186 0.00146 0.00115 0.00091 0.00061
## Cumulative Proportion  0.99031 0.99250 0.99436 0.99581 0.99696 0.99787 0.99849
##                           PC26    PC27    PC28    PC29    PC30    PC31   PC32
## Standard deviation     3.98204 3.86099 2.78793 2.07996 1.60886 0.88819 0.3374
## Proportion of Variance 0.00052 0.00049 0.00025 0.00014 0.00008 0.00003 0.0000
## Cumulative Proportion  0.99900 0.99949 0.99974 0.99989 0.99997 1.00000 1.0000
tibble::tibble(sd=pcaTrain$sdev, var=sd**2, n=1:length(pcaTrain$sdev)) %>%
    ggplot(aes(x=n)) + 
    geom_col(aes(y=var/sum(var)), fill="lightblue") +
    geom_text(aes(y=cumsum(var)/sum(var), label=round(cumsum(var)/sum(var), 2)), hjust=0, size=2.5) + 
    geom_line(aes(y=cumsum(var)/sum(var))) +
    labs(x="Component", y="Variance Explained", title="Variance Explained (cumulative and incremental)")

Around 75% of variance is explained by the first 5 components, with around 90% explained by the first 10 components

A simple random forest classification is explored, for prediction of month:

# Simple random forest model
rfTempTrainMonth <- ranger::ranger(month ~ ., 
                                   data=nycTempTrain[, c('month', varsTrain)], 
                                   importance = "impurity"
                                   )
rfTempTrainMonth
## Ranger result
## 
## Call:
##  ranger::ranger(month ~ ., data = nycTempTrain[, c("month", varsTrain)],      importance = "impurity") 
## 
## Type:                             Classification 
## Number of trees:                  500 
## Sample size:                      88452 
## Number of independent variables:  32 
## Mtry:                             5 
## Target node size:                 1 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error:             0.30 %
# Variable importance
rfTempTrainMonth$variable.importance %>% 
    as.data.frame() %>% 
    purrr::set_names("imp") %>% 
    rownames_to_column("metric") %>% 
    tibble::as_tibble() %>%
    ggplot(aes(x=fct_reorder(metric, imp), y=imp/1000)) + 
    geom_col(fill="lightblue") + 
    labs(x=NULL, y="Variable Importance (000)", title="Simple random forest to predict month") +
    coord_flip()

# Performance on test data (confirm >99% accuracy)
rfTempTest <- nycTempTest %>%
    mutate(pred=predict(rfTempTrainMonth, data=.)$predictions)
cat("\nAccuracy on test dataset is: ", round(100*mean(rfTempTest$pred==rfTempTest$month), 2), "%\n", sep="")
## 
## Accuracy on test dataset is: 99.75%
rfTempTest %>%
    count(month, pred) %>%
    ggplot(aes(x=pred, y=month)) + 
    geom_tile(aes(fill=n)) + 
    geom_text(aes(label=n), size=2.5) +
    scale_fill_continuous("", low="white", high="green") + 
    labs(x="Predicted month", y="Actual month", title="Predicting month on test data")

The simple random forest has over 99% predictive accuracy on month, primarily focusing on metrics related to soil (soil temperature and soil moisture at various depths).

A portion of the predictive accuracy may be based on specific soil trends during a given year, as it is very unlikely that data consistently change right at 00h00 of a new month. Models are run using a holdout year:

# Simple random forest model, holding out 2022 data
rfTempHoldout <- ranger::ranger(month ~ ., 
                                data=nycTempTrain[year(nycTempTrain$date) != 2022, c('month', varsTrain)], 
                                importance = "impurity"
                                )
rfTempHoldout
## Ranger result
## 
## Call:
##  ranger::ranger(month ~ ., data = nycTempTrain[year(nycTempTrain$date) !=      2022, c("month", varsTrain)], importance = "impurity") 
## 
## Type:                             Classification 
## Number of trees:                  500 
## Sample size:                      81858 
## Number of independent variables:  32 
## Mtry:                             5 
## Target node size:                 1 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error:             0.29 %
# Performance on holdout data
rfTempTest <- nycTempTrain %>%
    bind_rows(nycTempTest) %>%
    filter(year(date)==2022) %>%
    mutate(pred=predict(rfTempHoldout, data=.)$predictions)
cat("\nAccuracy on holdout 2022 data is: ", round(100*mean(rfTempTest$pred==rfTempTest$month), 2), "%\n", sep="")
## 
## Accuracy on holdout 2022 data is: 82.32%
rfTempTest %>%
    count(month, pred) %>%
    ggplot(aes(x=pred, y=month)) + 
    geom_tile(aes(fill=n)) + 
    geom_text(aes(label=n), size=2.5) +
    scale_fill_continuous("", low="white", high="green") + 
    labs(x="Predicted month (2022)", 
         y="Actual month (2022)", 
         title="Applying random forest fit without 2022 data to 2022"
         )

Without access to training data including 2022, the model still makes good predictions for 2022 month. But, predictions are commonly off by +/- 1 month leading to overall accuracy of ~80% (vs. 99%+ when able to train on soil heating patterns in the given year)

The process for running random forest is converted to functional form:

runSimpleRF <- function(df, yVar, xVars=NULL, ...) {

    # FUNCTION ARGUMENTS:
    # df: data frame containing observations
    # yVar: variable to be predicted (numeric for regression, categorical for classification)
    # xVars: predictor variables (NULL means everything in df except for yVar)
    # ...: other arguments passed to ranger::ranger
    
    # Create xVars if passed as NULL
    if(is.null(xVars)) xVars <- setdiff(names(df), yVar)
    
    # Simple random forest model
    ranger::ranger(as.formula(paste0(yVar, "~", paste0(xVars, collapse="+"))), 
                   data=df[, c(yVar, xVars)], 
                   ...
                   )
    
}

The function is checked for consistency:

tstRF <- runSimpleRF(df=nycTempTrain, yVar="month", xVars=varsTrain, importance="impurity")
tstRF
## Ranger result
## 
## Call:
##  ranger::ranger(as.formula(paste0(yVar, "~", paste0(xVars, collapse = "+"))),      data = df[, c(yVar, xVars)], ...) 
## 
## Type:                             Classification 
## Number of trees:                  500 
## Sample size:                      88452 
## Number of independent variables:  32 
## Mtry:                             5 
## Target node size:                 1 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error:             0.28 %
# Comparison of variable importance
rfTempTrainMonth$variable.importance %>% 
    as.data.frame() %>% 
    purrr::set_names("imp") %>% 
    rownames_to_column("metric") %>% 
    tibble::as_tibble() %>%
    mutate(type="original") %>%
    bind_rows(tstRF$variable.importance %>% 
                  as.data.frame() %>% 
                  purrr::set_names("imp") %>% 
                  rownames_to_column("metric") %>% 
                  tibble::as_tibble() %>%
                  mutate(type="formula")
              ) %>%
    ggplot(aes(x=fct_reorder(metric, imp), y=imp/1000)) + 
    geom_point(aes(color=type)) + 
    labs(x=NULL, y="Variable Importance (000)", title="Simple random forest to predict month") +
    coord_flip()

Aside from minor differences due to different random conditions (no seed was set prior to either run), the function and original call produce substantially the same outputs

A function is written for plotting variable importance:

plotRFImportance <- function(rf, 
                             impName="variable.importance", 
                             divBy=1000, 
                             plotTitle=NULL, 
                             plotData=TRUE, 
                             returnData=!isTRUE(plotData)
                             ) {
    
    # FUNCTION ARGUMENTS:
    # rf: output list from random forest with an element for importance
    # impName: name of the element to extract from rf
    # divBy: divisor for the importance variable
    # plotTitle: title for plot (NULL means use default)
    # plotData: boolean, should the importance plot be created and printed?
    # returnData: boolean, should the processed data be returned?
    
    # Create title if not provided
    if(is.null(plotTitle)) plotTitle <- "Importance for simple random forest"

    # Create y-axis label
    yAxisLabel="Variable Importance"
    if(!isTRUE(all.equal(divBy, 1))) yAxisLabel <- paste0(yAxisLabel, " (", divBy, "s)")
    
    # Create variable importance
    df <- rf[[impName]] %>% 
        as.data.frame() %>% 
        purrr::set_names("imp") %>% 
        rownames_to_column("metric") %>% 
        tibble::as_tibble() 
    
    # Create and print plot if requested
    if(isTRUE(plotData)) {
        p1 <- df %>%
            ggplot(aes(x=fct_reorder(metric, imp), y=imp/divBy)) + 
            geom_col(fill="lightblue") + 
            labs(x=NULL, y=yAxisLabel, title=plotTitle) +
            coord_flip()
        print(p1)
    }
    
    # Return data if requested
    if(isTRUE(returnData)) return(df)
    
}

# Example for plot only
plotRFImportance(tstRF)

# Example for data only
plotRFImportance(tstRF, plotData=FALSE)
## # A tibble: 32 × 2
##    metric                      imp
##    <chr>                     <dbl>
##  1 pct_temperature_2m       2053. 
##  2 pct_relativehumidity_2m   613. 
##  3 pct_dewpoint_2m          1758. 
##  4 pct_apparent_temperature 2240. 
##  5 pct_pressure_msl         1186. 
##  6 pct_surface_pressure     1185. 
##  7 pct_precipitation          89.5
##  8 pct_rain                   82.4
##  9 pct_snowfall               20.9
## 10 pct_cloudcover            461. 
## # ℹ 22 more rows

A function is written for applying the model to test (or other) data:

# Ranger needs to be explicitly in the namespace for predictions
library(ranger)
## Warning: package 'ranger' was built under R version 4.2.3
predictRF <- function(rf, df, newCol="pred", predsOnly=FALSE) {
    
    # FUNCTION ARGUMENTS:
    # rf: a trained random forest model
    # df: data frame for adding predictions
    # newCol: name for new column to be added to df
    # predsOnly: boolean, should only the vector of predictions be returned?
    #            if FALSE, a column named newCol is added to df, with df returned

    # Performance on holdout data
    preds <- predict(rf, data=df)$predictions
    
    # Return just the predictions if requested otherwise add as final column to df
    if(isTRUE(predsOnly)) return(preds)
    else {
        df[newCol] <- preds
        return(df)
    }
    
}

# Predictions only
summary(predictRF(rf=tstRF, df=nycTempTest, predsOnly=TRUE))
##  Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec 
## 2629 2435 2628 2547 2525 2449 2332 2390 2361 2446 2375 2367
# Full data frame
glimpse(predictRF(rf=tstRF, df=nycTempTest))
## Rows: 29,484
## Columns: 85
## $ time                              <dttm> 2010-01-01 00:00:00, 2010-01-01 01:…
## $ date                              <date> 2010-01-01, 2010-01-01, 2010-01-01,…
## $ hour                              <int> 0, 1, 4, 16, 22, 1, 3, 5, 6, 7, 9, 2…
## $ temperature_2m                    <dbl> -1.1, -1.0, -0.9, 5.0, -1.2, -1.2, -…
## $ relativehumidity_2m               <int> 95, 96, 97, 72, 89, 87, 77, 72, 72, …
## $ dewpoint_2m                       <dbl> -1.7, -1.6, -1.3, 0.4, -2.8, -3.0, -…
## $ apparent_temperature              <dbl> -3.9, -3.9, -3.7, 2.2, -5.1, -6.2, -…
## $ pressure_msl                      <dbl> 1017.2, 1016.5, 1015.6, 1010.3, 1008…
## $ surface_pressure                  <dbl> 1012.6, 1011.9, 1011.0, 1005.8, 1003…
## $ precipitation                     <dbl> 0.5, 0.5, 0.1, 0.0, 0.0, 0.2, 0.0, 0…
## $ rain                              <dbl> 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
## $ snowfall                          <dbl> 0.35, 0.28, 0.07, 0.00, 0.00, 0.14, …
## $ cloudcover                        <int> 90, 93, 71, 100, 94, 100, 100, 100, …
## $ cloudcover_low                    <int> 2, 8, 15, 84, 39, 97, 98, 85, 80, 67…
## $ cloudcover_mid                    <int> 98, 96, 95, 44, 98, 91, 96, 97, 86, …
## $ cloudcover_high                   <int> 97, 93, 0, 84, 0, 0, 0, 0, 0, 0, 0, …
## $ shortwave_radiation               <dbl> 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 11, …
## $ direct_radiation                  <dbl> 0, 0, 0, 55, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ direct_normal_irradiance          <dbl> 0.0, 0.0, 0.0, 185.3, 0.0, 0.0, 0.0,…
## $ diffuse_radiation                 <dbl> 0, 0, 0, 115, 0, 0, 0, 0, 0, 0, 10, …
## $ windspeed_10m                     <dbl> 3.1, 3.5, 3.5, 4.6, 9.0, 16.6, 18.1,…
## $ windspeed_100m                    <dbl> 3.8, 3.1, 6.4, 9.9, 20.5, 30.3, 30.7…
## $ winddirection_10m                 <int> 339, 336, 336, 309, 299, 309, 317, 3…
## $ winddirection_100m                <int> 41, 21, 344, 314, 308, 314, 321, 316…
## $ windgusts_10m                     <dbl> 9.0, 9.7, 7.6, 12.2, 16.2, 28.4, 34.…
## $ et0_fao_evapotranspiration        <dbl> 0.00, 0.00, 0.00, 0.07, 0.00, 0.00, …
## $ weathercode                       <int> 73, 73, 71, 3, 3, 71, 71, 71, 71, 71…
## $ vapor_pressure_deficit            <dbl> 0.03, 0.02, 0.02, 0.24, 0.06, 0.07, …
## $ soil_temperature_0_to_7cm         <dbl> -0.7, -0.7, -0.6, 2.3, -0.3, -0.5, -…
## $ soil_temperature_7_to_28cm        <dbl> 0.1, 0.2, 0.2, 0.7, 0.9, 0.8, 0.8, 0…
## $ soil_temperature_28_to_100cm      <dbl> 4.2, 4.2, 4.1, 4.0, 4.0, 4.0, 3.9, 3…
## $ soil_temperature_100_to_255cm     <dbl> 10.6, 10.6, 10.6, 10.5, 10.5, 10.5, …
## $ soil_moisture_0_to_7cm            <dbl> 0.373, 0.374, 0.377, 0.385, 0.380, 0…
## $ soil_moisture_7_to_28cm           <dbl> 0.377, 0.377, 0.377, 0.376, 0.376, 0…
## $ soil_moisture_28_to_100cm         <dbl> 0.413, 0.413, 0.413, 0.411, 0.411, 0…
## $ soil_moisture_100_to_255cm        <dbl> 0.412, 0.412, 0.412, 0.412, 0.412, 0…
## $ origTime                          <chr> "2010-01-01T00:00", "2010-01-01T01:0…
## $ month                             <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, J…
## $ fct_hour                          <fct> 0, 1, 4, 16, 22, 1, 3, 5, 6, 7, 9, 2…
## $ pct_hour                          <dbl> 0, 4, 17, 67, 92, 4, 13, 21, 25, 29,…
## $ pct_temperature_2m                <dbl> 10, 10, 11, 28, 10, 10, 11, 9, 9, 8,…
## $ pct_relativehumidity_2m           <dbl> 92, 94, 96, 47, 80, 75, 56, 47, 47, …
## $ pct_dewpoint_2m                   <dbl> 23, 24, 25, 29, 20, 20, 17, 13, 12, …
## $ pct_apparent_temperature          <dbl> 15, 15, 15, 31, 12, 9, 10, 8, 8, 7, …
## $ pct_pressure_msl                  <dbl> 53, 49, 44, 19, 12, 8, 8, 8, 7, 7, 6…
## $ pct_surface_pressure              <dbl> 51, 47, 42, 18, 11, 8, 8, 7, 7, 7, 6…
## $ pct_precipitation                 <dbl> 93, 93, 86, 0, 0, 88, 0, 0, 88, 0, 0…
## $ pct_rain                          <dbl> 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ pct_snowfall                      <dbl> 99, 99, 98, 0, 0, 99, 98, 98, 99, 98…
## $ pct_cloudcover                    <dbl> 77, 79, 68, 81, 79, 81, 81, 81, 81, …
## $ pct_cloudcover_low                <dbl> 51, 60, 65, 84, 74, 88, 89, 84, 83, …
## $ pct_cloudcover_mid                <dbl> 90, 89, 88, 71, 90, 86, 89, 89, 84, …
## $ pct_cloudcover_high               <dbl> 81, 76, 0, 71, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_shortwave_radiation           <dbl> 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 51, 0…
## $ pct_direct_radiation              <dbl> 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 52, 0…
## $ pct_direct_normal_irradiance      <dbl> 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 55, 0…
## $ pct_diffuse_radiation             <dbl> 0, 0, 0, 80, 0, 0, 0, 0, 0, 0, 52, 0…
## $ pct_windspeed_10m                 <dbl> 3, 4, 4, 8, 38, 83, 88, 84, 76, 82, …
## $ pct_windspeed_100m                <dbl> 2, 1, 6, 15, 57, 88, 88, 85, 75, 83,…
## $ pct_winddirection_10m             <dbl> 94, 93, 93, 83, 78, 83, 87, 84, 79, …
## $ pct_winddirection_100m            <dbl> 8, 4, 96, 85, 82, 85, 88, 86, 81, 84…
## $ pct_windgusts_10m                 <dbl> 3, 4, 1, 10, 25, 71, 84, 80, 78, 79,…
## $ pct_et0_fao_evapotranspiration    <dbl> 0, 0, 0, 61, 0, 0, 32, 32, 32, 32, 4…
## $ pct_weathercode                   <dbl> 99, 99, 98, 69, 69, 98, 98, 98, 98, …
## $ pct_vapor_pressure_deficit        <dbl> 2, 1, 1, 40, 6, 8, 22, 24, 24, 24, 2…
## $ pct_soil_temperature_0_to_7cm     <dbl> 6, 6, 7, 19, 9, 8, 9, 9, 8, 8, 6, 3,…
## $ pct_soil_temperature_7_to_28cm    <dbl> 7, 7, 7, 10, 11, 10, 10, 10, 10, 10,…
## $ pct_soil_temperature_28_to_100cm  <dbl> 16, 16, 15, 15, 15, 15, 14, 14, 14, …
## $ pct_soil_temperature_100_to_255cm <dbl> 42, 42, 42, 41, 41, 41, 41, 41, 41, …
## $ pct_soil_moisture_0_to_7cm        <dbl> 70, 71, 74, 79, 76, 76, 75, 74, 74, …
## $ pct_soil_moisture_7_to_28cm       <dbl> 69, 69, 69, 68, 68, 68, 68, 68, 68, …
## $ pct_soil_moisture_28_to_100cm     <dbl> 96, 96, 96, 95, 95, 95, 95, 95, 95, …
## $ pct_soil_moisture_100_to_255cm    <dbl> 96, 96, 96, 96, 96, 96, 96, 96, 96, …
## $ pct_0005                          <int> 3, 3, 2, 1, 2, 4, 2, 0, 0, 4, 4, 0, …
## $ pct_0025                          <int> 21, 11, 21, 20, 19, 9, 24, 8, 24, 16…
## $ pct_0100                          <int> 32, 30, 76, 47, 73, 79, 54, 68, 46, …
## $ pct_0250                          <int> 82, 35, 128, 213, 237, 225, 104, 100…
## $ pct_0500                          <int> 104, 158, 359, 192, 383, 456, 134, 2…
## $ pct_1000                          <int> 676, 214, 997, 987, 747, 330, 472, 9…
## $ pct_2500                          <int> 257, 1953, 2131, 2209, 1466, 241, 17…
## $ pct_5000                          <int> 2280, 1473, 2156, 3618, 2887, 1547, …
## $ tod                               <fct> Night, Night, Night, Day, Night, Nig…
## $ season                            <fct> Winter, Winter, Winter, Winter, Wint…
## $ todSeason                         <fct> Winter-Night, Winter-Night, Winter-N…
## $ pred                              <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, J…

A function is written to report accuracy:

reportAccuracy <- function(df, 
                           trueCol, 
                           predCol="pred", 
                           reportAcc=TRUE, 
                           rndReport=2, 
                           useLabel="requested data",
                           returnAcc=!isTRUE(reportAcc)
                           ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame containing actual and predictions
    # trueCol: column containing true value
    # predCol: column containing predicted value
    # reportAcc: boolean, should accuracy be reported (printed to output)?
    # rndReport: number of significant digits for reporting (will be converted to percentage first)
    # useLabel: label for data to be used in reporting
    # returnAcc: boolean, should the accuracy be returned 
    #            return value is not converted to percentage, not rounded
    
    # For now, implemented only for categorical
    acc <- mean(df[trueCol]==df[predCol])
    
    # Report accuracy if requested
    if(isTRUE(reportAcc)) cat("\nAccuracy of ", useLabel, " is: ", round(100*acc, rndReport), "%\n", sep="")    
    
    # Return accuracy statistic if requested
    if(isTRUE(returnAcc)) return(acc)
    
}

reportAccuracy(predictRF(rf=tstRF, df=nycTempTest), trueCol="month")
## 
## Accuracy of requested data is: 99.76%
reportAccuracy(predictRF(rf=tstRF, df=nycTempTest), 
               trueCol="month", 
               rndReport=3, 
               useLabel="predictions applied to test dataset",
               returnAcc=TRUE
               )
## 
## Accuracy of predictions applied to test dataset is: 99.763%
## [1] 0.9976258

A function is written to plot confusion:

plotConfusion <- function(df, 
                          trueCol, 
                          predCol="pred", 
                          useTitle=NULL,
                          useSub=NULL
                           ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame containing actual and predictions
    # trueCol: column containing true value
    # predCol: column containing predicted value
    # useTitle: title to be used for chart (NULL means create from trueCol)
    # useSub: subtitle to be used for chart (NULL means none)
    
    # Create title if not supplied
    if(is.null(useTitle)) useTitle <- paste0("Predicting ", trueCol)
    
    # Confusion matrix (categorical)
    p1 <- df %>%
        group_by(across(all_of(c(trueCol, predCol)))) %>%
        summarize(n=n(), .groups="drop") %>%
        ggplot(aes(x=get(predCol), y=get(trueCol))) + 
        geom_tile(aes(fill=n)) + 
        geom_text(aes(label=n), size=2.5) +
        scale_fill_continuous("", low="white", high="green") + 
        labs(x="Predicted", y="Actual", title=useTitle, subtitle=useSub)
    print(p1)
    
}

plotConfusion(predictRF(rf=tstRF, df=nycTempTest), trueCol="month")

plotConfusion(predictRF(rf=tstRF, df=nycTempTest), trueCol="month", useSub="Model applied to test dataset")

Functions are run to build the same random forest excluding 2023 data, and with 2022 as a holdout year:

# 1. Run random forest
tstRFHoldout <- runSimpleRF(df=filter(nycTempTrain, lubridate::year(date)<2022), 
                            yVar="month", 
                            xVars=varsTrain, 
                            importance="impurity"
                            )
tstRFHoldout
## Ranger result
## 
## Call:
##  ranger::ranger(as.formula(paste0(yVar, "~", paste0(xVars, collapse = "+"))),      data = df[, c(yVar, xVars)], ...) 
## 
## Type:                             Classification 
## Number of trees:                  500 
## Sample size:                      78907 
## Number of independent variables:  32 
## Mtry:                             5 
## Target node size:                 1 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error:             0.29 %
# 2. Plot variable importance
plotRFImportance(tstRFHoldout)

# 3. Predict on year 2022
tstRFPred <- predictRF(rf=tstRFHoldout, 
                       df=filter(bind_rows(nycTempTest, nycTempTrain), lubridate::year(date)==2022))
tstRFPred
## # A tibble: 8,760 × 85
##    time                date        hour temperature_2m relativehumidity_2m
##    <dttm>              <date>     <int>          <dbl>               <int>
##  1 2022-01-01 01:00:00 2022-01-01     1            8.9                  98
##  2 2022-01-01 02:00:00 2022-01-01     2            8.8                  98
##  3 2022-01-01 03:00:00 2022-01-01     3            8.9                  99
##  4 2022-01-01 13:00:00 2022-01-01    13           11.3                  96
##  5 2022-01-01 14:00:00 2022-01-01    14           12.1                  93
##  6 2022-01-01 18:00:00 2022-01-01    18           10.8                  99
##  7 2022-01-01 23:00:00 2022-01-01    23            9.9                  99
##  8 2022-01-02 01:00:00 2022-01-02     1            9.7                  98
##  9 2022-01-02 02:00:00 2022-01-02     2            9.7                  97
## 10 2022-01-02 17:00:00 2022-01-02    17           11                    87
## # ℹ 8,750 more rows
## # ℹ 80 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## #   pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## #   rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## #   cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## #   direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## #   diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
# 4. Report on accuracy
reportAccuracy(tstRFPred, 
               trueCol="month", 
               rndReport=3, 
               useLabel="predictions based on pre-2022 training data applied to 2022 holdout dataset"
               )
## 
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.642%
# 5. Plot confusion data
plotConfusion(tstRFPred, 
              trueCol="month", 
              useSub="Model based on pre-2022 training data applied to 2022 holdout dataset"
              )

The model learns generalized trends, recording ~80% accuracy (always within +/- 1 month) for predictions applied to 2022 holdout data. Some of what the model learns is specific to one or more years in the non-holdout training dataset, accounting for >99% OOB accuracy in training data falling to ~80% accuracy in a holdout year

A main function is created for running all steps:

runFullRF <- function(dfTrain, 
                      yVar, 
                      xVars, 
                      dfTest=dfTrain,
                      useLabel="test data",
                      useSub=NULL, 
                      returnData=FALSE
                      ) {
    
    # FUNCTION ARGUMENTS:
    # dfTrain: training data
    # yVar: dependent variable
    # xVars: column(s) containing independent variables
    # dfTest: test dataset for applying predictions
    # useLabel: label to be used for reporting accuracy
    # useSub: subtitle to be used for confusion chart (NULL means none)
    # returnData: boolean, should data be returned?

    # 1. Run random forest using impurity for importance
    rf <- runSimpleRF(df=dfTrain, yVar=yVar, xVars=xVars, importance="impurity")

    # 2. Plot variable importance
    rfImp <- plotRFImportance(rf, returnData=TRUE)

    # 3. Predict on test dataset
    tstPred <- predictRF(rf=rf, df=dfTest)

    # 4. Report on accuracy
    rfAcc <- reportAccuracy(tstPred, trueCol=yVar, rndReport=3, useLabel=useLabel, returnAcc=TRUE)

    # 5. Plot confusion data
    plotConfusion(tstPred, trueCol=yVar, useSub=useSub)
    
    #6. Return data if requested
    if(isTRUE(returnData)) return(list(rf=rf, rfImp=rfImp, tstPred=tstPred, rfAcc=rfAcc))
    
}

The function is tested:

dfTrain <- filter(nycTempTrain, lubridate::year(date)<2022)
dfTest <- filter(bind_rows(nycTempTest, nycTempTrain), lubridate::year(date)==2022)
keyLabel <- "predictions based on pre-2022 training data applied to 2022 holdout dataset"

runFullRF(dfTrain=dfTrain, 
          yVar="month", 
          xVars=varsTrain, 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel)
          )

## 
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.991%

runFullRF(dfTrain=dfTrain, 
          yVar="month", 
          xVars=varsTrain, 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          returnData=TRUE
          ) %>%
    str(max.level=3)

## 
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.368%

## List of 4
##  $ rf     :List of 16
##   ..$ predictions              : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ num.trees                : num 500
##   ..$ num.independent.variables: num 32
##   ..$ mtry                     : num 5
##   ..$ min.node.size            : num 1
##   ..$ variable.importance      : Named num [1:32] 1815 550 1575 1757 1052 ...
##   .. ..- attr(*, "names")= chr [1:32] "pct_temperature_2m" "pct_relativehumidity_2m" "pct_dewpoint_2m" "pct_apparent_temperature" ...
##   ..$ prediction.error         : num 0.00284
##   ..$ forest                   :List of 9
##   .. ..$ num.trees                 : num 500
##   .. ..$ child.nodeIDs             :List of 500
##   .. ..$ split.varIDs              :List of 500
##   .. ..$ split.values              :List of 500
##   .. ..$ is.ordered                : logi [1:32] TRUE TRUE TRUE TRUE TRUE TRUE ...
##   .. ..$ class.values              : num [1:12] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..$ levels                    : chr [1:12] "Jan" "Feb" "Mar" "Apr" ...
##   .. ..$ independent.variable.names: chr [1:32] "pct_temperature_2m" "pct_relativehumidity_2m" "pct_dewpoint_2m" "pct_apparent_temperature" ...
##   .. ..$ treetype                  : chr "Classification"
##   .. ..- attr(*, "class")= chr "ranger.forest"
##   ..$ confusion.matrix         : 'table' int [1:12, 1:12] 6676 7 0 0 0 0 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   ..$ splitrule                : chr "gini"
##   ..$ treetype                 : chr "Classification"
##   ..$ call                     : language ranger::ranger(as.formula(paste0(yVar, "~", paste0(xVars, collapse = "+"))),      data = df[, c(yVar, xVars)], ...)
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 14 5 17 20 5 20 14 17
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x0000011cb28efa08> 
##   ..$ importance.mode          : chr "impurity"
##   ..$ num.samples              : int 78907
##   ..$ replace                  : logi TRUE
##   ..$ dependent.variable.name  : chr "month"
##   ..- attr(*, "class")= chr "ranger"
##  $ rfImp  : tibble [32 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ metric: chr [1:32] "pct_temperature_2m" "pct_relativehumidity_2m" "pct_dewpoint_2m" "pct_apparent_temperature" ...
##   ..$ imp   : num [1:32] 1815 550 1575 1757 1052 ...
##  $ tstPred: tibble [8,760 × 85] (S3: tbl_df/tbl/data.frame)
##   ..$ time                             : POSIXct[1:8760], format: "2022-01-01 01:00:00" "2022-01-01 02:00:00" ...
##   ..$ date                             : Date[1:8760], format: "2022-01-01" "2022-01-01" ...
##   ..$ hour                             : int [1:8760] 1 2 3 13 14 18 23 1 2 17 ...
##   ..$ temperature_2m                   : num [1:8760] 8.9 8.8 8.9 11.3 12.1 10.8 9.9 9.7 9.7 11 ...
##   ..$ relativehumidity_2m              : int [1:8760] 98 98 99 96 93 99 99 98 97 87 ...
##   ..$ dewpoint_2m                      : num [1:8760] 8.6 8.5 8.7 10.7 11 10.7 9.7 9.4 9.3 9 ...
##   ..$ apparent_temperature             : num [1:8760] 7.4 7.4 7.9 11 11.6 10 8.7 8.3 7.9 9.1 ...
##   ..$ pressure_msl                     : num [1:8760] 1013 1013 1012 1008 1008 ...
##   ..$ surface_pressure                 : num [1:8760] 1008 1009 1008 1004 1003 ...
##   ..$ precipitation                    : num [1:8760] 0 0 0.7 0 0 0.3 4.2 3.7 1.1 0 ...
##   ..$ rain                             : num [1:8760] 0 0 0.7 0 0 0.3 4.2 3.7 1.1 0 ...
##   ..$ snowfall                         : num [1:8760] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ cloudcover                       : int [1:8760] 100 100 100 100 100 100 100 100 100 48 ...
##   ..$ cloudcover_low                   : int [1:8760] 93 91 95 83 71 100 100 100 100 12 ...
##   ..$ cloudcover_mid                   : int [1:8760] 71 93 73 100 100 100 100 100 97 12 ...
##   ..$ cloudcover_high                  : int [1:8760] 88 97 100 100 100 100 7 100 97 100 ...
##   ..$ shortwave_radiation              : num [1:8760] 0 0 0 78 127 2 0 0 0 55 ...
##   ..$ direct_radiation                 : num [1:8760] 0 0 0 0 4 0 0 0 0 3 ...
##   ..$ direct_normal_irradiance         : num [1:8760] 0 0 0 0 9.2 0 0 0 0 17.6 ...
##   ..$ diffuse_radiation                : num [1:8760] 0 0 0 78 123 2 0 0 0 52 ...
##   ..$ windspeed_10m                    : num [1:8760] 8 6.9 4.3 3.6 5.6 6.8 7.6 8 11 10.8 ...
##   ..$ windspeed_100m                   : num [1:8760] 14.2 11.5 7.9 5.7 7.9 8.4 19.3 16.3 22.6 21.5 ...
##   ..$ winddirection_10m                : int [1:8760] 190 186 185 6 333 18 19 18 341 143 ...
##   ..$ winddirection_100m               : int [1:8760] 187 180 180 325 24 31 14 5 343 145 ...
##   ..$ windgusts_10m                    : num [1:8760] 12.6 13.3 12.6 17.3 18.4 3.6 19.8 13.3 16.2 22.7 ...
##   ..$ et0_fao_evapotranspiration       : num [1:8760] 0 0 0 0.05 0.07 0.01 0 0 0 0.04 ...
##   ..$ weathercode                      : int [1:8760] 3 3 53 3 3 51 63 63 55 1 ...
##   ..$ vapor_pressure_deficit           : num [1:8760] 0.03 0.03 0.02 0.05 0.1 0.01 0.01 0.02 0.03 0.16 ...
##   ..$ soil_temperature_0_to_7cm        : num [1:8760] 8.2 8.1 8.2 9.7 10.3 9.7 9.3 9.2 9.2 11.2 ...
##   ..$ soil_temperature_7_to_28cm       : num [1:8760] 8 8 8 8.3 8.5 8.8 8.9 8.9 8.9 9.8 ...
##   ..$ soil_temperature_28_to_100cm     : num [1:8760] 7.6 7.6 7.6 7.8 7.8 7.9 7.9 8 8 8.2 ...
##   ..$ soil_temperature_100_to_255cm    : num [1:8760] 12.2 12.2 12.2 12.1 12.1 12.1 12.1 12.1 12.1 12 ...
##   ..$ soil_moisture_0_to_7cm           : num [1:8760] 0.388 0.388 0.393 0.418 0.414 0.423 0.439 0.431 0.439 0.402 ...
##   ..$ soil_moisture_7_to_28cm          : num [1:8760] 0.397 0.397 0.396 0.4 0.401 0.405 0.43 0.433 0.433 0.417 ...
##   ..$ soil_moisture_28_to_100cm        : num [1:8760] 0.381 0.381 0.381 0.381 0.382 0.382 0.383 0.391 0.392 0.4 ...
##   ..$ soil_moisture_100_to_255cm       : num [1:8760] 0.378 0.378 0.378 0.378 0.378 0.378 0.378 0.378 0.378 0.378 ...
##   ..$ origTime                         : chr [1:8760] "2022-01-01T01:00" "2022-01-01T02:00" "2022-01-01T03:00" "2022-01-01T13:00" ...
##   ..$ month                            : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ fct_hour                         : Factor w/ 24 levels "0","1","2","3",..: 2 3 4 14 15 19 24 2 3 18 ...
##   ..$ pct_hour                         : num [1:8760] 4 8 13 54 58 75 96 4 8 71 ...
##   ..$ pct_temperature_2m               : num [1:8760] 40 39 40 47 49 45 43 42 42 46 ...
##   ..$ pct_relativehumidity_2m          : num [1:8760] 97 97 99 94 88 99 99 97 96 75 ...
##   ..$ pct_dewpoint_2m                  : num [1:8760] 53 53 54 60 61 60 57 56 55 54 ...
##   ..$ pct_apparent_temperature         : num [1:8760] 44 44 45 52 53 50 47 46 45 48 ...
##   ..$ pct_pressure_msl                 : num [1:8760] 30 31 27 12 11 8 4 3 2 8 ...
##   ..$ pct_surface_pressure             : num [1:8760] 29 31 27 12 11 8 4 3 2 8 ...
##   ..$ pct_precipitation                : num [1:8760] 0 0 94 0 0 90 100 100 96 0 ...
##   ..$ pct_rain                         : num [1:8760] 0 0 95 0 0 91 100 100 96 0 ...
##   ..$ pct_snowfall                     : num [1:8760] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ pct_cloudcover                   : num [1:8760] 81 81 81 81 81 81 81 81 81 58 ...
##   ..$ pct_cloudcover_low               : num [1:8760] 86 86 87 84 81 91 91 91 91 63 ...
##   ..$ pct_cloudcover_mid               : num [1:8760] 79 87 79 94 94 94 94 94 89 52 ...
##   ..$ pct_cloudcover_high              : num [1:8760] 73 81 88 88 88 88 46 88 81 88 ...
##   ..$ pct_shortwave_radiation          : num [1:8760] 0 0 0 60 64 47 0 0 0 57 ...
##   ..$ pct_direct_radiation             : num [1:8760] 0 0 0 0 56 0 0 0 0 55 ...
##   ..$ pct_direct_normal_irradiance     : num [1:8760] 0 0 0 0 54 0 0 0 0 56 ...
##   ..$ pct_diffuse_radiation            : num [1:8760] 0 0 0 68 82 48 0 0 0 61 ...
##   ..$ pct_windspeed_10m                : num [1:8760] 30 22 7 4 13 21 27 30 54 53 ...
##   ..$ pct_windspeed_100m               : num [1:8760] 30 20 9 5 9 11 52 38 66 61 ...
##   ..$ pct_winddirection_10m            : num [1:8760] 38 37 37 1 93 3 4 3 95 26 ...
##   ..$ pct_winddirection_100m           : num [1:8760] 37 35 35 90 5 6 3 1 96 27 ...
##   ..$ pct_windgusts_10m                : num [1:8760] 11 13 11 29 34 0 41 13 25 52 ...
##   ..$ pct_et0_fao_evapotranspiration   : num [1:8760] 0 0 0 53 61 22 0 0 0 48 ...
##   ..$ pct_weathercode                  : num [1:8760] 69 69 92 69 69 86 97 97 94 36 ...
##   ..$ pct_vapor_pressure_deficit       : num [1:8760] 2 2 1 4 14 0 0 1 2 26 ...
##   ..$ pct_soil_temperature_0_to_7cm    : num [1:8760] 37 37 37 41 43 41 40 40 40 45 ...
##   ..$ pct_soil_temperature_7_to_28cm   : num [1:8760] 35 35 35 36 37 38 38 38 38 41 ...
##   ..$ pct_soil_temperature_28_to_100cm : num [1:8760] 31 31 31 32 32 33 33 33 33 34 ...
##   ..$ pct_soil_temperature_100_to_255cm: num [1:8760] 49 49 49 48 48 48 48 48 48 48 ...
##   ..$ pct_soil_moisture_0_to_7cm       : num [1:8760] 82 82 85 96 95 97 100 99 100 90 ...
##   ..$ pct_soil_moisture_7_to_28cm      : num [1:8760] 86 86 85 88 89 91 99 100 100 97 ...
##   ..$ pct_soil_moisture_28_to_100cm    : num [1:8760] 69 69 69 69 69 69 70 78 79 88 ...
##   ..$ pct_soil_moisture_100_to_255cm   : num [1:8760] 51 51 51 51 51 51 51 51 51 51 ...
##   ..$ pct_0005                         : int [1:8760] 0 5 3 2 5 3 2 4 4 5 ...
##   ..$ pct_0025                         : int [1:8760] 10 1 17 5 0 17 13 3 5 5 ...
##   ..$ pct_0100                         : int [1:8760] 83 58 19 95 23 92 5 69 36 76 ...
##   ..$ pct_0250                         : int [1:8760] 125 18 126 116 191 166 200 1 15 231 ...
##   ..$ pct_0500                         : int [1:8760] 427 344 218 133 186 290 333 129 465 319 ...
##   ..$ pct_1000                         : int [1:8760] 916 180 999 46 239 897 756 157 259 59 ...
##   ..$ pct_2500                         : int [1:8760] 1934 1910 1837 1519 2238 912 1747 1417 1977 1 ...
##   ..$ pct_5000                         : int [1:8760] 430 2032 1112 1432 1565 3262 124 786 475 2732 ...
##   ..$ tod                              : Factor w/ 2 levels "Day","Night": 2 2 2 1 1 1 2 2 2 1 ...
##   ..$ season                           : Factor w/ 4 levels "Spring","Summer",..: 4 4 4 4 4 4 4 4 4 4 ...
##   ..$ todSeason                        : Factor w/ 8 levels "Spring-Day","Spring-Night",..: 8 8 8 7 7 7 8 8 8 7 ...
##   ..$ pred                             : Factor w/ 12 levels "Jan","Feb","Mar",..: 12 12 12 12 12 12 12 12 12 12 ...
##  $ rfAcc  : num 0.834

The function is run for hour as a factor:

dfTrain <- filter(nycTempTrain, lubridate::year(date)<2022)
dfTest <- filter(bind_rows(nycTempTest, nycTempTrain), lubridate::year(date)==2022)
keyLabel <- "predictions based on pre-2022 training data applied to 2022 holdout dataset"

runFullRF(dfTrain=dfTrain, 
          yVar="fct_hour", 
          xVars=varsTrain, 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel)
          )
## Growing trees.. Progress: 82%. Estimated remaining time: 6 seconds.

## 
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.966%

Function reportAccuracy() is updated for continuous variables:

# Backup of old function
oldReportAccuracy <- reportAccuracy

# Update for continuous variables
reportAccuracy <- function(df, 
                           trueCol, 
                           predCol="pred", 
                           reportAcc=TRUE, 
                           rndReport=2, 
                           useLabel="requested data",
                           returnAcc=!isTRUE(reportAcc), 
                           reportR2=FALSE
                           ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame containing actual and predictions
    # trueCol: column containing true value
    # predCol: column containing predicted value
    # reportAcc: boolean, should accuracy be reported (printed to output)?
    # rndReport: number of significant digits for reporting (will be converted to percentage first)
    # useLabel: label for data to be used in reporting
    # returnAcc: boolean, should the accuracy be returned 
    #            return value is not converted to percentage, not rounded
    # reportR2: boolean, should accuracy be calculated as R-squared?
    #           (default FALSE measures as categorical)
    
    # Continuous or categorical reporting
    if(isTRUE(reportR2)) {
        tc <- df %>% pull(get(trueCol))
        pc <- df %>% pull(get(predCol))
        mseNull <- mean((tc-mean(tc))**2)
        msePred <- mean((tc-pc)**2)
        r2 <- 1 - msePred/mseNull
        if(isTRUE(reportAcc)) 
            cat("\nR-squared of ", 
                useLabel, 
                " is: ", 
                round(100*r2, rndReport), 
                "% (RMSE ",
                round(sqrt(msePred), 2), 
                " vs. ", 
                round(sqrt(mseNull), 2),
                " null)\n", 
                sep=""
                )
        acc <- c("mseNull"=mseNull, "msePred"=msePred, "r2"=r2)
    } else {
        acc <- mean(df[trueCol]==df[predCol])
        if(isTRUE(reportAcc)) 
            cat("\nAccuracy of ", useLabel, " is: ", round(100*acc, rndReport), "%\n", sep="")    
    }
    
    # Return accuracy statistic if requested
    if(isTRUE(returnAcc)) return(acc)
    
}

# Test for categorical
set.seed(23121100)
dfTestCat <- tibble::tibble(trueVal=factor(sample(month.abb, 1000, replace=TRUE), levels=month.abb),
                            predNull=factor(sample(month.abb, 1000, replace=TRUE), levels=month.abb), 
                            predVal=factor(month.abb[as.integer(ifelse(runif(1000)<=0.9, trueVal, predNull))], 
                                           levels=month.abb
                                           )
                            )
dfTestCat
## # A tibble: 1,000 × 3
##    trueVal predNull predVal
##    <fct>   <fct>    <fct>  
##  1 Oct     May      Oct    
##  2 Sep     Nov      Sep    
##  3 Dec     Feb      Dec    
##  4 Oct     Nov      Oct    
##  5 Nov     Feb      Nov    
##  6 Nov     Nov      Nov    
##  7 Aug     Jan      Aug    
##  8 Feb     Dec      Feb    
##  9 Jun     Jun      Jun    
## 10 Oct     Sep      Oct    
## # ℹ 990 more rows
reportAccuracy(dfTestCat, trueCol="trueVal", predCol="predNull", useLabel="random 12-category null")
## 
## Accuracy of random 12-category null is: 8.6%
oldReportAccuracy(dfTestCat, trueCol="trueVal", predCol="predNull", useLabel="random 12-category null")
## 
## Accuracy of random 12-category null is: 8.6%
reportAccuracy(dfTestCat, trueCol="trueVal", predCol="predVal", useLabel="random 12-category (90% true)")
## 
## Accuracy of random 12-category (90% true) is: 91.2%
oldReportAccuracy(dfTestCat, trueCol="trueVal", predCol="predVal", useLabel="random 12-category (90% true)")
## 
## Accuracy of random 12-category (90% true) is: 91.2%
# Test for continuous
set.seed(23121100)
dfTestCont <- tibble::tibble(trueVal=rnorm(100000),
                             predNull=mean(trueVal), 
                             predVal=ifelse(runif(100000)<=0.9, trueVal, predNull)
                             )
dfTestCont
## # A tibble: 100,000 × 3
##    trueVal predNull predVal
##      <dbl>    <dbl>   <dbl>
##  1   0.551  0.00113   0.551
##  2   0.872  0.00113   0.872
##  3   1.40   0.00113   1.40 
##  4   0.116  0.00113   0.116
##  5   0.726  0.00113   0.726
##  6  -0.969  0.00113  -0.969
##  7  -2.10   0.00113  -2.10 
##  8   0.540  0.00113   0.540
##  9  -0.115  0.00113  -0.115
## 10   1.32   0.00113   1.32 
## # ℹ 99,990 more rows
reportAccuracy(dfTestCont, 
               trueCol="trueVal", 
               predCol="predNull", 
               useLabel="continuous rnorm null", 
               reportR2=TRUE
               )
## 
## R-squared of continuous rnorm null is: 0% (RMSE 1 vs. 1 null)
reportAccuracy(dfTestCont, 
               trueCol="trueVal", 
               predCol="predVal", 
               useLabel="continuous rnorm (90% true)", 
               reportR2=TRUE
               )
## 
## R-squared of continuous rnorm (90% true) is: 89.73% (RMSE 0.32 vs. 1 null)

Function plotConfusion() is updated for continuous variables:

# Backup of old function
oldPlotConfusion <- plotConfusion

# Update for continuous variables
plotConfusion <- function(df, 
                          trueCol, 
                          predCol="pred", 
                          useTitle=NULL,
                          useSub=NULL, 
                          plotCont=FALSE
                          ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame containing actual and predictions
    # trueCol: column containing true value
    # predCol: column containing predicted value
    # useTitle: title to be used for chart (NULL means create from trueCol)
    # useSub: subtitle to be used for chart (NULL means none)
    # plotCont: boolean, should plotting assume continuous variables?
    #           (default FALSE assumes confusion plot for categorical variables)
    
    # Create title if not supplied
    if(is.null(useTitle)) useTitle <- paste0("Predicting ", trueCol)

    # Create base plot (applicable to categorical or continuous variables)
    p1 <- df %>%
        group_by(across(all_of(c(trueCol, predCol)))) %>%
        summarize(n=n(), .groups="drop") %>%
        ggplot(aes(x=get(predCol), y=get(trueCol))) + 
        labs(x="Predicted", y="Actual", title=useTitle, subtitle=useSub)
        
    # Update plot as appropriate
    if(isTRUE(plotCont)) {
        p1 <- p1 +
            geom_point(aes(size=n), alpha=0.5) + 
            scale_size_continuous("# Obs") +
            coord_flip() +
            geom_smooth(aes(weight=n), method="lm")
    } else {
        p1 <- p1 + 
            geom_tile(aes(fill=n)) + 
            geom_text(aes(label=n), size=2.5) +
            scale_fill_continuous("", low="white", high="green")
    }
    
    # Output plot
    print(p1)
    
}

# Test for categorical (use same data as above)
plotConfusion(dfTestCat, trueCol="trueVal", predCol="predNull")

oldPlotConfusion(dfTestCat, trueCol="trueVal", predCol="predNull")

plotConfusion(dfTestCat, trueCol="trueVal", predCol="predVal")

oldPlotConfusion(dfTestCat, trueCol="trueVal", predCol="predVal")

# Test for continuous (use same data as above, all data rounded to nearest 0.05)
dfTestCont05 <- dfTestCont %>% mutate(across(where(is.numeric), .fns=function(x) round(x*20)/20))
dfTestCont05
## # A tibble: 100,000 × 3
##    trueVal predNull predVal
##      <dbl>    <dbl>   <dbl>
##  1    0.55        0    0.55
##  2    0.85        0    0.85
##  3    1.4         0    1.4 
##  4    0.1         0    0.1 
##  5    0.75        0    0.75
##  6   -0.95        0   -0.95
##  7   -2.1         0   -2.1 
##  8    0.55        0    0.55
##  9   -0.1         0   -0.1 
## 10    1.3         0    1.3 
## # ℹ 99,990 more rows
plotConfusion(dfTestCont05, trueCol="trueVal", predCol="predNull", plotCont=TRUE)
## `geom_smooth()` using formula = 'y ~ x'

plotConfusion(dfTestCont05, trueCol="trueVal", predCol="predVal", plotCont=TRUE)
## `geom_smooth()` using formula = 'y ~ x'

The main function is updated to run all steps for continuous or categorical:

oldRunFullRF <- runFullRF

runFullRF <- function(dfTrain, 
                      yVar, 
                      xVars, 
                      dfTest=dfTrain,
                      useLabel="test data",
                      useSub=NULL, 
                      isContVar=FALSE,
                      returnData=FALSE, 
                      ...
                      ) {
    
    # FUNCTION ARGUMENTS:
    # dfTrain: training data
    # yVar: dependent variable
    # xVars: column(s) containing independent variables
    # dfTest: test dataset for applying predictions
    # useLabel: label to be used for reporting accuracy
    # useSub: subtitle to be used for confusion chart (NULL means none)
    # isContVar: boolean, is the variable continuous? (default FALSE means categorical)
    # returnData: boolean, should data be returned?
    # ...: additional parameters to pass to runSimpleRF(), which are then passed to ranger::ranger()

    # 1. Run random forest using impurity for importance
    rf <- runSimpleRF(df=dfTrain, yVar=yVar, xVars=xVars, importance="impurity", ...)

    # 2. Plot variable importance
    rfImp <- plotRFImportance(rf, returnData=TRUE)

    # 3. Predict on test dataset
    tstPred <- predictRF(rf=rf, df=dfTest)

    # 4. Report on accuracy (updated for continuous or categorical)
    rfAcc <- reportAccuracy(tstPred, 
                            trueCol=yVar, 
                            rndReport=3, 
                            useLabel=useLabel, 
                            reportR2=isTRUE(isContVar),
                            returnAcc=TRUE
                            )

    # 5. Plot confusion data (updated for continuous vs. categorical)
    plotConfusion(tstPred, trueCol=yVar, useSub=useSub, plotCont=isTRUE(isContVar))
    
    #6. Return data if requested
    if(isTRUE(returnData)) return(list(rf=rf, rfImp=rfImp, tstPred=tstPred, rfAcc=rfAcc))
    
}

The new function is tested on categorical data (default):

runFullRF(dfTrain=dfTrain, 
          yVar="month", 
          xVars=varsTrain, 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel)
          )

## 
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.911%

The new function is tested on continuous data (new):

tmp2m <- runFullRF(dfTrain=dfTrain, 
                   yVar="temperature_2m", 
                   xVars=varsTrain[!str_detect(varsTrain, "temper")], 
                   dfTest=dfTest, 
                   useLabel=keyLabel, 
                   useSub=stringr::str_to_sentence(keyLabel), 
                   isContVar = TRUE, 
                   returnData=TRUE
                   )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.525% (RMSE 0.71 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'

The new function is tested excluding dewpoint:

tmp2m$rfImp %>% 
    arrange(-imp)
## # A tibble: 26 × 2
##    metric                              imp
##    <chr>                             <dbl>
##  1 pct_dewpoint_2m                3234810.
##  2 pct_vapor_pressure_deficit     1012362.
##  3 pct_soil_moisture_7_to_28cm     906531.
##  4 pct_soil_moisture_0_to_7cm      707109.
##  5 pct_soil_moisture_28_to_100cm   628903.
##  6 pct_et0_fao_evapotranspiration  364899.
##  7 pct_relativehumidity_2m         256441.
##  8 pct_soil_moisture_100_to_255cm  171792.
##  9 pct_pressure_msl                125768.
## 10 pct_diffuse_radiation           102166.
## # ℹ 16 more rows
runFullRF(dfTrain=dfTrain, 
          yVar="temperature_2m", 
          xVars=varsTrain[!str_detect(varsTrain, "temper|dew")], 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.047% (RMSE 2.51 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'

Function plotConfusion() is updated to aggregate (round) continuous variables:

# Update for continuous variables
plotConfusion <- function(df, 
                          trueCol, 
                          predCol="pred", 
                          useTitle=NULL,
                          useSub=NULL, 
                          plotCont=FALSE, 
                          rndTo=NULL
                          ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame containing actual and predictions
    # trueCol: column containing true value
    # predCol: column containing predicted value
    # useTitle: title to be used for chart (NULL means create from trueCol)
    # useSub: subtitle to be used for chart (NULL means none)
    # plotCont: boolean, should plotting assume continuous variables?
    #           (default FALSE assumes confusion plot for categorical variables)
    # rndTo: number for rounding continuous variables (NULL means no rounding)
    
    # Create title if not supplied
    if(is.null(useTitle)) useTitle <- paste0("Predicting ", trueCol)

    # Round data if requested
    if(!is.null(rndTo)) {
        df <- df %>%
            mutate(across(all_of(c(trueCol, predCol)), .fns=function(x) round(x/rndTo)*rndTo))
    }
    
    # Create base plot (applicable to categorical or continuous variables)
    p1 <- df %>%
        group_by(across(all_of(c(trueCol, predCol)))) %>%
        summarize(n=n(), .groups="drop") %>%
        ggplot(aes(x=get(predCol), y=get(trueCol))) + 
        labs(x="Predicted", y="Actual", title=useTitle, subtitle=useSub)
        
    # Update plot as appropriate
    if(isTRUE(plotCont)) {
        p1 <- p1 +
            geom_point(aes(size=n), alpha=0.5) + 
            scale_size_continuous("# Obs") +
            coord_flip() +
            geom_smooth(aes(weight=n), method="lm")
    } else {
        p1 <- p1 + 
            geom_tile(aes(fill=n)) + 
            geom_text(aes(label=n), size=2.5) +
            scale_fill_continuous("", low="white", high="green")
    }
    
    # Output plot
    print(p1)
    
}

# Test for continuous (use same data as above, all data rounded to nearest 0.05)
plotConfusion(dfTestCont05, trueCol="trueVal", predCol="predVal", plotCont=TRUE)
## `geom_smooth()` using formula = 'y ~ x'

# Run function on non-rounded data
plotConfusion(dfTestCont, trueCol="trueVal", predCol="predVal", plotCont=TRUE, rndTo=0.05)
## `geom_smooth()` using formula = 'y ~ x'

The main function is updated to include rounding:

runFullRF <- function(dfTrain, 
                      yVar, 
                      xVars, 
                      dfTest=dfTrain,
                      useLabel="test data",
                      useSub=NULL, 
                      isContVar=FALSE,
                      rndTo=NULL,
                      returnData=FALSE, 
                      ...
                      ) {
    
    # FUNCTION ARGUMENTS:
    # dfTrain: training data
    # yVar: dependent variable
    # xVars: column(s) containing independent variables
    # dfTest: test dataset for applying predictions
    # useLabel: label to be used for reporting accuracy
    # useSub: subtitle to be used for confusion chart (NULL means none)
    # isContVar: boolean, is the variable continuous? (default FALSE means categorical)
    # rndTo: round x and y to the nearest rndTo prior to counting examples and plotting accuracy 
    #        (NULL means categorical data or requests no rounding of continuous data)
    # returnData: boolean, should data be returned?
    # ...: additional parameters to pass to runSimpleRF(), which are then passed to ranger::ranger()

    # 1. Run random forest using impurity for importance
    rf <- runSimpleRF(df=dfTrain, yVar=yVar, xVars=xVars, importance="impurity", ...)

    # 2. Plot variable importance
    rfImp <- plotRFImportance(rf, returnData=TRUE)

    # 3. Predict on test dataset
    tstPred <- predictRF(rf=rf, df=dfTest)

    # 4. Report on accuracy (updated for continuous or categorical)
    rfAcc <- reportAccuracy(tstPred, 
                            trueCol=yVar, 
                            rndReport=3, 
                            useLabel=useLabel, 
                            reportR2=isTRUE(isContVar),
                            returnAcc=TRUE
                            )

    # 5. Plot confusion data (updated for continuous vs. categorical)
    plotConfusion(tstPred, trueCol=yVar, useSub=useSub, plotCont=isTRUE(isContVar), rndTo=rndTo)
    
    #6. Return data if requested
    if(isTRUE(returnData)) return(list(rf=rf, rfImp=rfImp, tstPred=tstPred, rfAcc=rfAcc))
    
}

The new function is tested for temperature, excluding predictors that are temperature or dewpoint:

runFullRF(dfTrain=dfTrain, 
          yVar="temperature_2m", 
          xVars=varsTrain[!str_detect(varsTrain, "temper|dew")], 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=1
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.054% (RMSE 2.51 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'

For comparison, a model is run with access only to month and hour, then with only access to dewpoint:

runFullRF(dfTrain=dfTrain, 
          yVar="temperature_2m", 
          xVars=c("month", "fct_hour"), 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=1
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.39% (RMSE 4.68 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'

runFullRF(dfTrain=dfTrain, 
          yVar="temperature_2m", 
          xVars=c("dewpoint_2m"), 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=1
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.594% (RMSE 4.17 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'

runFullRF(dfTrain=dfTrain, 
          yVar="temperature_2m", 
          xVars=c("month", "fct_hour", "dewpoint_2m"), 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=1
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.298% (RMSE 2.46 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'

Access to either 1) both month and hour, or 2) dewpoint reduces RMSE from ~10 in the null model to between 4-5. Including all 3 further reduces RMSE to ~2.5

Function plotConfusion() is updated to optionally add a y=x reference line:

# Update for continuous variables
plotConfusion <- function(df, 
                          trueCol, 
                          predCol="pred", 
                          useTitle=NULL,
                          useSub=NULL, 
                          plotCont=FALSE, 
                          rndTo=NULL,
                          refXY=FALSE
                          ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame containing actual and predictions
    # trueCol: column containing true value
    # predCol: column containing predicted value
    # useTitle: title to be used for chart (NULL means create from trueCol)
    # useSub: subtitle to be used for chart (NULL means none)
    # plotCont: boolean, should plotting assume continuous variables?
    #           (default FALSE assumes confusion plot for categorical variables)
    # rndTo: number for rounding continuous variables (NULL means no rounding)
    # refXY: boolean, should a reference line for y=x be included? (relevant only for continuous)
    
    # Create title if not supplied
    if(is.null(useTitle)) useTitle <- paste0("Predicting ", trueCol)

    # Round data if requested
    if(!is.null(rndTo)) {
        df <- df %>%
            mutate(across(all_of(c(trueCol, predCol)), .fns=function(x) round(x/rndTo)*rndTo))
    }
    
    # Create base plot (applicable to categorical or continuous variables)
    # Use x as true and y as predicted, for more meaningful geom_smooth() if continuous
    # Flip coordinates if categorical
    p1 <- df %>%
        group_by(across(all_of(c(trueCol, predCol)))) %>%
        summarize(n=n(), .groups="drop") %>%
        ggplot(aes(y=get(predCol), x=get(trueCol))) + 
        labs(y="Predicted", x="Actual", title=useTitle, subtitle=useSub)
        
    # Update plot as appropriate
    if(isTRUE(plotCont)) {
        p1 <- p1 +
            geom_point(aes(size=n), alpha=0.5) + 
            scale_size_continuous("# Obs") +
            geom_smooth(aes(weight=n), method="lm")
        if(isTRUE(refXY)) p1 <- p1 + geom_abline(slope=1, intercept=0, lty=2, color="red")
    } else {
        p1 <- p1 + 
            geom_tile(aes(fill=n)) + 
            geom_text(aes(label=n), size=2.5) +
            coord_flip() +
            scale_fill_continuous("", low="white", high="green")
    }
    
    # Output plot
    print(p1)
    
}

# Add reference line
plotConfusion(dfTestCont, trueCol="trueVal", predCol="predVal", plotCont=TRUE, rndTo=0.05, refXY=TRUE)
## `geom_smooth()` using formula = 'y ~ x'

The main function is updated to include the optional reference line:

runFullRF <- function(dfTrain, 
                      yVar, 
                      xVars, 
                      dfTest=dfTrain,
                      useLabel="test data",
                      useSub=NULL, 
                      isContVar=FALSE,
                      rndTo=NULL,
                      refXY=FALSE,
                      returnData=FALSE, 
                      ...
                      ) {
    
    # FUNCTION ARGUMENTS:
    # dfTrain: training data
    # yVar: dependent variable
    # xVars: column(s) containing independent variables
    # dfTest: test dataset for applying predictions
    # useLabel: label to be used for reporting accuracy
    # useSub: subtitle to be used for confusion chart (NULL means none)
    # isContVar: boolean, is the variable continuous? (default FALSE means categorical)
    # rndTo: round x and y to the nearest rndTo prior to counting examples and plotting accuracy 
    #        (NULL means categorical data or requests no rounding of continuous data)
    # refXY: boolean, should a reference line for y=x be included? (relevant only for continuous)
    # returnData: boolean, should data be returned?
    # ...: additional parameters to pass to runSimpleRF(), which are then passed to ranger::ranger()

    # 1. Run random forest using impurity for importance
    rf <- runSimpleRF(df=dfTrain, yVar=yVar, xVars=xVars, importance="impurity", ...)

    # 2. Plot variable importance
    rfImp <- plotRFImportance(rf, returnData=TRUE)

    # 3. Predict on test dataset
    tstPred <- predictRF(rf=rf, df=dfTest)

    # 4. Report on accuracy (updated for continuous or categorical)
    rfAcc <- reportAccuracy(tstPred, 
                            trueCol=yVar, 
                            rndReport=3, 
                            useLabel=useLabel, 
                            reportR2=isTRUE(isContVar),
                            returnAcc=TRUE
                            )

    # 5. Plot confusion data (updated for continuous vs. categorical)
    plotConfusion(tstPred, trueCol=yVar, useSub=useSub, plotCont=isTRUE(isContVar), rndTo=rndTo, refXY=refXY)
    
    #6. Return data if requested
    if(isTRUE(returnData)) return(list(rf=rf, rfImp=rfImp, tstPred=tstPred, rfAcc=rfAcc))
    
}

The updated function is again tested for temperature, excluding predictors that are temperature or dewpoint:

runFullRF(dfTrain=dfTrain, 
          yVar="temperature_2m", 
          xVars=varsTrain[!str_detect(varsTrain, "temper|dew")], 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=1, 
          refXY=TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.98% (RMSE 2.53 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'

The updated function is re-run with access only to month and hour, then with only access to dewpoint:

runFullRF(dfTrain=dfTrain, 
          yVar="temperature_2m", 
          xVars=c("month", "fct_hour"), 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=1, 
          refXY=TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.307% (RMSE 4.69 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'

runFullRF(dfTrain=dfTrain, 
          yVar="temperature_2m", 
          xVars=c("dewpoint_2m"), 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=1, 
          refXY=TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.594% (RMSE 4.17 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'

runFullRF(dfTrain=dfTrain, 
          yVar="temperature_2m", 
          xVars=c("month", "fct_hour", "dewpoint_2m"), 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=1,
          refXY=TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.331% (RMSE 2.45 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'

The updated function is tested for rainfall, excluding predictors that are rain, snow, or precipitation:

runFullRF(dfTrain=dfTrain, 
          yVar="rain", 
          xVars=varsTrain[!str_detect(varsTrain, "rain|snow|precip")], 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=0.1, 
          refXY=TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.071% (RMSE 0.3 vs. 0.45 null)
## `geom_smooth()` using formula = 'y ~ x'

The updated function is tested on month (categorical data by default):

runFullRF(dfTrain=dfTrain, 
          yVar="month", 
          xVars=varsTrain, 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel)
          )

## 
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.128%

Patterns in the deepest soil temperature are explored:

dfTrain %>%
    mutate(year=year(date), dom=day(date)) %>%
    filter(dom!=29 | month != "Feb") %>%
    select(year, month, dom, soil_temperature_100_to_255cm) %>%
    group_by(year, month, dom) %>%
    summarize(across(where(is.numeric), .fns=list(mean=mean, sd=sd)), .groups="drop") %>%
    pivot_longer(cols=-c(year, month, dom)) %>%
    mutate(newdate=lubridate::mdy(paste0(as.character(month), "-", zeroPad2(dom), "-2021"))) %>%
    ggplot(aes(x=newdate, y=value)) + 
    geom_line(aes(group=factor(year), color=factor(year))) + 
    facet_wrap(~name, ncol=1, scales="free_y") + 
    labs(x=NULL, y="Soil Temperature (C)", title="Soil Temperature by day and year") + 
    scale_color_discrete(NULL) +
    scale_x_date(labels=scales::date_format("%b"))

runFullRF(dfTrain=dfTrain, 
          yVar="soil_temperature_100_to_255cm", 
          xVars=c("month"), 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=1, 
          refXY=TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 96.474% (RMSE 1.1 vs. 5.84 null)
## `geom_smooth()` using formula = 'y ~ x'

Deep soil temperature varies gradually in a largely stable and repeatable annual pattern. There is very little intra-day fluctuation in deep soil temperature, and a model using month only achieves R-squared over 95%

Models using only day-of-year and month are explored:

runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
          yVar="soil_temperature_100_to_255cm", 
          xVars=c("doy"), 
          dfTest=dfTest %>% mutate(doy=yday(date)), 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=.25, 
          refXY=TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.806% (RMSE 0.64 vs. 5.84 null)
## `geom_smooth()` using formula = 'y ~ x'

runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
          yVar="soil_temperature_100_to_255cm", 
          xVars=c("doy", "month"), 
          dfTest=dfTest %>% mutate(doy=yday(date)), 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=.25, 
          refXY=TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.361% (RMSE 0.75 vs. 5.84 null)
## `geom_smooth()` using formula = 'y ~ x'

Including day-of-year cuts RMSE by ~40%, as variations in soil temperature from early-month to late-month can also be partially captured

Models including all variables except for day-of-year and month are explored:

rfSoilTemp100255 <- runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
                              yVar="soil_temperature_100_to_255cm", 
                              xVars=varsTrain[!str_detect(varsTrain, pattern="soil_temperature_100_to_255cm")], 
                              dfTest=dfTest %>% mutate(doy=yday(date)), 
                              useLabel=keyLabel, 
                              useSub=stringr::str_to_sentence(keyLabel), 
                              isContVar = TRUE, 
                              rndTo=.25, 
                              refXY=TRUE, 
                              returnData=TRUE
                              )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.625% (RMSE 1.48 vs. 5.84 null)
## `geom_smooth()` using formula = 'y ~ x'

Prediction accuracy is decreased, and anomalies introduced, as day-of-year (or its surrogate month) is removed, even as many other potential explanatory variables are added

Prediction vs. month is further explored:

rfSoilTemp100255$tstPred %>%
    mutate(across(.cols=c(soil_temperature_100_to_255cm, pred), .fns=function(x) round(x*2)/2)) %>% 
    count(month, soil_temperature_100_to_255cm, pred) %>% 
    ggplot(aes(y=soil_temperature_100_to_255cm, x=pred)) + 
    geom_point(aes(size=n)) + 
    geom_smooth(aes(weight=n), method="lm") + 
    geom_abline(slope=1, intercept=0, lty=2, color="red") + 
    facet_wrap(~month) + 
    labs(title="Prediction vs. Actual for Deep Soil Temperature (100 cm to 255 cm)", 
         x="Predicted", 
         y="Actual"
         )
## `geom_smooth()` using formula = 'y ~ x'

rfSoilTemp100255$tstPred %>%
    mutate(across(.cols=c(soil_temperature_100_to_255cm, pred), .fns=function(x) round(x*1)/1)) %>% 
    count(month, soil_temperature_100_to_255cm, pred) %>% 
    ggplot(aes(x=pred, y=n)) + 
    geom_col(aes(fill=month), position="fill") + 
    coord_flip() +
    labs(title="Predicted Deep Soil Temperature (100 cm to 255 cm) vs. Actual Month", 
         x="Predicted Deep Soil Temperature", 
         y="% Observations in Month"
         ) + 
    scale_fill_discrete("Month")

Prediction vs. actual is further explored:

rfSoilTemp100255$tstPred %>% 
    select(time, actual=soil_temperature_100_to_255cm, pred) %>% 
    pivot_longer(cols=-c(time)) %>% 
    ggplot(aes(x=time, y=value)) + 
    geom_line(aes(group=name, color=name)) + 
    labs(x=NULL, 
         y="Soil Temperature", 
         title="Predicted vs. Actual for Deep Soil Temperature (100 cm - 255 cm)"
         ) + 
    scale_color_discrete("Metric")

rfSoilTemp100255$tstPred %>% 
    select(date, month, actual=soil_temperature_100_to_255cm, pred) %>% 
    pivot_longer(cols=-c(date, month), names_to="metric") %>% 
    group_by(date, month, metric) %>%
    filter(n()>1) %>%
    summarize(mean=mean(value), sd=sd(value), .groups="drop") %>%
    pivot_longer(cols=-c(date, month, metric)) %>%
    ggplot(aes(x=month, y=value)) + 
    geom_boxplot(aes(fill=metric)) + 
    labs(x=NULL, 
         y=NULL, 
         title="Daily Metrics for Deep Soil Temperature (100 cm - 255 cm)"
         ) + 
    scale_fill_discrete("Metric") + 
    facet_wrap(~c("mean"="Daily Mean", "sd"="Daily Std. Deviation")[name], scales="free_y")

Predictions have a similar annual pattern as actual, with monthly means broadly aligned. Predictions are often variable over the course of a single day, while actual values change little (if at all) during a single day

The model including only day-of-year and month is explored for comparison:

rfSoilTemp255doy <- runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
                              yVar="soil_temperature_100_to_255cm", 
                              xVars=c("doy"), 
                              dfTest=dfTest %>% mutate(doy=yday(date)), 
                              useLabel=keyLabel, 
                              useSub=stringr::str_to_sentence(keyLabel), 
                              isContVar = TRUE, 
                              rndTo=.25, 
                              refXY=TRUE, 
                              returnData=TRUE
                              )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.806% (RMSE 0.64 vs. 5.84 null)
## `geom_smooth()` using formula = 'y ~ x'

rfSoilTemp255doy$tstPred %>%
    mutate(across(.cols=c(soil_temperature_100_to_255cm, pred), .fns=function(x) round(x*2)/2)) %>% 
    count(month, soil_temperature_100_to_255cm, pred) %>% 
    ggplot(aes(y=soil_temperature_100_to_255cm, x=pred)) + 
    geom_point(aes(size=n)) + 
    geom_smooth(aes(weight=n), method="lm") + 
    geom_abline(slope=1, intercept=0, lty=2, color="red") + 
    facet_wrap(~month) + 
    labs(title="Prediction vs. Actual for Deep Soil Temperature (100 cm to 255 cm)", 
         x="Predicted", 
         y="Actual"
         )
## `geom_smooth()` using formula = 'y ~ x'

rfSoilTemp255doy$tstPred %>%
    mutate(across(.cols=c(soil_temperature_100_to_255cm, pred), .fns=function(x) round(x*1)/1)) %>% 
    count(month, soil_temperature_100_to_255cm, pred) %>% 
    ggplot(aes(x=pred, y=n)) + 
    geom_col(aes(fill=month), position="fill") + 
    coord_flip() +
    labs(title="Predicted Deep Soil Temperature (100 cm to 255 cm) vs. Actual Month", 
         x="Predicted Deep Soil Temperature", 
         y="% Observations in Month"
         ) + 
    scale_fill_discrete("Month")

rfSoilTemp255doy$tstPred %>% 
    select(time, actual=soil_temperature_100_to_255cm, pred) %>% 
    pivot_longer(cols=-c(time)) %>% 
    ggplot(aes(x=time, y=value)) + 
    geom_line(aes(group=name, color=name)) + 
    labs(x=NULL, 
         y="Soil Temperature", 
         title="Predicted vs. Actual for Deep Soil Temperature (100 cm - 255 cm)"
         ) + 
    scale_color_discrete("Metric")

rfSoilTemp255doy$tstPred %>% 
    select(date, month, actual=soil_temperature_100_to_255cm, pred) %>% 
    pivot_longer(cols=-c(date, month), names_to="metric") %>% 
    group_by(date, month, metric) %>%
    filter(n()>1) %>%
    summarize(mean=mean(value), sd=sd(value), .groups="drop") %>%
    pivot_longer(cols=-c(date, month, metric)) %>%
    ggplot(aes(x=month, y=value)) + 
    geom_boxplot(aes(fill=metric)) + 
    labs(x=NULL, 
         y=NULL, 
         title="Daily Metrics for Deep Soil Temperature (100 cm - 255 cm)"
         ) + 
    scale_fill_discrete("Metric") + 
    facet_wrap(~c("mean"="Daily Mean", "sd"="Daily Std. Deviation")[name], scales="free_y")

Predictions using day-of-year have a much closer match to the seasonal pattern of the actual 2022 holdout data

Patterns in the deepest soil moisture are explored:

dfTrain %>%
    mutate(year=year(date), dom=day(date)) %>%
    filter(dom!=29 | month != "Feb") %>%
    select(year, month, dom, soil_moisture_100_to_255cm) %>%
    group_by(year, month, dom) %>%
    summarize(across(where(is.numeric), .fns=list(mean=mean, sd=sd)), .groups="drop") %>%
    pivot_longer(cols=-c(year, month, dom)) %>%
    mutate(newdate=lubridate::mdy(paste0(as.character(month), "-", zeroPad2(dom), "-2021"))) %>%
    ggplot(aes(x=newdate, y=value)) + 
    geom_line(aes(group=factor(year), color=factor(year))) + 
    facet_wrap(~name, ncol=1, scales="free_y") + 
    labs(x=NULL, y="Soil Moisture", title="Soil Moisture by day and year") + 
    scale_color_discrete(NULL) +
    scale_x_date(labels=scales::date_format("%b"))

runFullRF(dfTrain=dfTrain, 
          yVar="soil_moisture_100_to_255cm", 
          xVars=c("month"), 
          dfTest=dfTest, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=.0025, 
          refXY=TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.924% (RMSE 0.01 vs. 0.02 null)
## `geom_smooth()` using formula = 'y ~ x'

Deep soil moisture varies gradually, though not in a stable and repeatable annual pattern. There is very little intra-day fluctuation in deep soil temperature, and a model using month only achieves R-squared of around 80%

Models using only day-of-year and month are explored:

runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
          yVar="soil_moisture_100_to_255cm", 
          xVars=c("doy"), 
          dfTest=dfTest %>% mutate(doy=yday(date)), 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=.0025, 
          refXY=TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.021% (RMSE 0.01 vs. 0.02 null)
## `geom_smooth()` using formula = 'y ~ x'

runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
          yVar="soil_moisture_100_to_255cm", 
          xVars=c("doy", "month"), 
          dfTest=dfTest %>% mutate(doy=yday(date)), 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=.0025, 
          refXY=TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 84.788% (RMSE 0.01 vs. 0.02 null)
## `geom_smooth()` using formula = 'y ~ x'

In contrast to deep soil temperature, Including day-of-year makes no meaningful change in RMSE for predictions of deep soil moisture. Soil moisture patterns appear to be only somewhat seasonal, with different patterns by year

Models including all variables except for day-of-year and month are explored:

rfSoilMois100255 <- runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
                              yVar="soil_moisture_100_to_255cm", 
                              xVars=varsTrain[!str_detect(varsTrain, pattern="soil_moisture_100_to_255cm")], 
                              dfTest=dfTest %>% mutate(doy=yday(date)), 
                              useLabel=keyLabel, 
                              useSub=stringr::str_to_sentence(keyLabel), 
                              isContVar = TRUE, 
                              rndTo=.0025, 
                              refXY=TRUE, 
                              returnData=TRUE
                              )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.749% (RMSE 0.01 vs. 0.02 null)
## `geom_smooth()` using formula = 'y ~ x'

Prediction accuracy is similar even as day-of-year (or its surrogate month) is removed. Soil temperature and moisture (at shallower levels) are roughly as good predictors of deep soil moisture as month or day of year

Prediction vs. month is further explored:

rfSoilMois100255$tstPred %>%
    mutate(across(.cols=c(soil_moisture_100_to_255cm, pred), .fns=function(x) round(x*400)/400)) %>% 
    count(month, soil_moisture_100_to_255cm, pred) %>% 
    ggplot(aes(y=soil_moisture_100_to_255cm, x=pred)) + 
    geom_point(aes(size=n)) + 
    geom_smooth(aes(weight=n), method="lm") + 
    geom_abline(slope=1, intercept=0, lty=2, color="red") + 
    facet_wrap(~month) + 
    labs(title="Prediction vs. Actual for Deep Soil Moisture (100 cm to 255 cm)", 
         x="Predicted", 
         y="Actual"
         )
## `geom_smooth()` using formula = 'y ~ x'

rfSoilMois100255$tstPred %>%
    mutate(across(.cols=c(soil_moisture_100_to_255cm, pred), .fns=function(x) round(x*200)/200)) %>% 
    count(month, soil_moisture_100_to_255cm, pred) %>% 
    ggplot(aes(x=pred, y=n)) + 
    geom_col(aes(fill=month), position="fill") + 
    coord_flip() +
    labs(title="Predicted Deep Soil Moisture (100 cm to 255 cm) vs. Actual Month", 
         x="Predicted Deep Soil Moisture", 
         y="% Observations in Month"
         ) + 
    scale_fill_discrete("Month")

Prediction vs. actual is further explored:

rfSoilMois100255$tstPred %>% 
    select(time, actual=soil_moisture_100_to_255cm, pred) %>% 
    pivot_longer(cols=-c(time)) %>% 
    ggplot(aes(x=time, y=value)) + 
    geom_line(aes(group=name, color=name)) + 
    labs(x=NULL, 
         y="Soil Moisture", 
         title="Predicted vs. Actual for Deep Soil Moisture (100 cm - 255 cm)"
         ) + 
    scale_color_discrete("Metric")

rfSoilMois100255$tstPred %>% 
    select(date, month, actual=soil_moisture_100_to_255cm, pred) %>% 
    pivot_longer(cols=-c(date, month), names_to="metric") %>% 
    group_by(date, month, metric) %>%
    filter(n()>1) %>%
    summarize(mean=mean(value), sd=sd(value), .groups="drop") %>%
    pivot_longer(cols=-c(date, month, metric)) %>%
    ggplot(aes(x=month, y=value)) + 
    geom_boxplot(aes(fill=metric)) + 
    labs(x=NULL, 
         y=NULL, 
         title="Daily Metrics for Deep Soil Moisture (100 cm - 255 cm)"
         ) + 
    scale_fill_discrete("Metric") + 
    facet_wrap(~c("mean"="Daily Mean", "sd"="Daily Std. Deviation")[name], scales="free_y")

Predictions are often variable over the course of a single day, while actual values change little (if at all) during a single day. As well, while predictions follow a similar seasonal pattern as actual values, predicted and actual means differ meaningfully in several months

The model including only day-of-year and month is explored for comparison:

rfSoilMois255doy <- runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
                              yVar="soil_moisture_100_to_255cm", 
                              xVars=c("doy"), 
                              dfTest=dfTest %>% mutate(doy=yday(date)), 
                              useLabel=keyLabel, 
                              useSub=stringr::str_to_sentence(keyLabel), 
                              isContVar = TRUE, 
                              rndTo=.0025, 
                              refXY=TRUE, 
                              returnData=TRUE
                              )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.041% (RMSE 0.01 vs. 0.02 null)
## `geom_smooth()` using formula = 'y ~ x'

rfSoilMois255doy$tstPred %>%
    mutate(across(.cols=c(soil_moisture_100_to_255cm, pred), .fns=function(x) round(x*400)/400)) %>% 
    count(month, soil_moisture_100_to_255cm, pred) %>% 
    ggplot(aes(y=soil_moisture_100_to_255cm, x=pred)) + 
    geom_point(aes(size=n)) + 
    geom_smooth(aes(weight=n), method="lm") + 
    geom_abline(slope=1, intercept=0, lty=2, color="red") + 
    facet_wrap(~month) + 
    labs(title="Prediction vs. Actual for Deep Soil Moisture (100 cm to 255 cm)", 
         x="Predicted", 
         y="Actual"
         )
## `geom_smooth()` using formula = 'y ~ x'

rfSoilMois255doy$tstPred %>%
    mutate(across(.cols=c(soil_moisture_100_to_255cm, pred), .fns=function(x) round(x*400)/400)) %>% 
    count(month, soil_moisture_100_to_255cm, pred) %>% 
    ggplot(aes(x=pred, y=n)) + 
    geom_col(aes(fill=month), position="fill") + 
    coord_flip() +
    labs(title="Predicted Deep Soil Moisture (100 cm to 255 cm) vs. Actual Month", 
         x="Predicted Deep Soil Moisture", 
         y="% Observations in Month"
         ) + 
    scale_fill_discrete("Month")

rfSoilMois255doy$tstPred %>% 
    select(time, actual=soil_moisture_100_to_255cm, pred) %>% 
    full_join(rfSoilMois100255$tstPred %>% select(time, oldpred=pred), by="time") %>%
    pivot_longer(cols=-c(time)) %>% 
    ggplot(aes(x=time, y=value)) + 
    geom_line(aes(group=name, 
                  color=c("actual"="1. Actual", "oldpred"="2. Prev Predicted", "pred"="3. New Predicted")[name]
                  )
              ) + 
    labs(x=NULL, 
         y="Soil Moisture", 
         title="Predicted vs. Actual for Deep Soil Moisture (100 cm - 255 cm)"
         ) + 
    scale_color_discrete("Metric")

rfSoilMois255doy$tstPred %>% 
    select(date, month, actual=soil_moisture_100_to_255cm, pred) %>% 
    pivot_longer(cols=-c(date, month), names_to="metric") %>% 
    group_by(date, month, metric) %>%
    filter(n()>1) %>%
    summarize(mean=mean(value), sd=sd(value), .groups="drop") %>%
    pivot_longer(cols=-c(date, month, metric)) %>%
    ggplot(aes(x=month, y=value)) + 
    geom_boxplot(aes(fill=metric)) + 
    labs(x=NULL, 
         y=NULL, 
         title="Daily Metrics for Deep Soil Moisture (100 cm - 255 cm)"
         ) + 
    scale_fill_discrete("Metric") + 
    facet_wrap(~c("mean"="Daily Mean", "sd"="Daily Std. Deviation")[name], scales="free_y")

Both predictions generally match the seasonal pattern, but fail to capture unique elements of the holdout data

Function runFullRF() is updated to omit plots for variables importance and/or confusion matrix:

runFullRF <- function(dfTrain, 
                      yVar, 
                      xVars, 
                      dfTest=dfTrain,
                      useLabel="test data",
                      useSub=NULL, 
                      isContVar=FALSE,
                      rndTo=NULL,
                      refXY=FALSE,
                      makePlots=TRUE,
                      plotImp=makePlots,
                      plotConf=makePlots,
                      returnData=FALSE, 
                      ...
                      ) {
    
    # FUNCTION ARGUMENTS:
    # dfTrain: training data
    # yVar: dependent variable
    # xVars: column(s) containing independent variables
    # dfTest: test dataset for applying predictions
    # useLabel: label to be used for reporting accuracy
    # useSub: subtitle to be used for confusion chart (NULL means none)
    # isContVar: boolean, is the variable continuous? (default FALSE means categorical)
    # rndTo: round x and y to the nearest rndTo prior to counting examples and plotting accuracy 
    #        (NULL means categorical data or requests no rounding of continuous data)
    # refXY: boolean, should a reference line for y=x be included? (relevant only for continuous)
    # makePlots: boolean, should plots be created for variable importance and confusion matrix?
    # plotImp: boolean, should variable importance be plotted? (default is makePlots)
    # plotConf: boolean, should confusion matrix be plotted? (default is makePlots)
    # returnData: boolean, should data be returned?
    # ...: additional parameters to pass to runSimpleRF(), which are then passed to ranger::ranger()

    # 1. Run random forest using impurity for importance
    rf <- runSimpleRF(df=dfTrain, yVar=yVar, xVars=xVars, importance="impurity", ...)

    # 2. Create, and optionally plot, variable importance
    rfImp <- plotRFImportance(rf, plotData=plotImp, returnData=TRUE)

    # 3. Predict on test dataset
    tstPred <- predictRF(rf=rf, df=dfTest)

    # 4. Report on accuracy (updated for continuous or categorical)
    rfAcc <- reportAccuracy(tstPred, 
                            trueCol=yVar, 
                            rndReport=3, 
                            useLabel=useLabel, 
                            reportR2=isTRUE(isContVar),
                            returnAcc=TRUE
                            )

    # 5. Plot confusion data (updated for continuous vs. categorical) if requested
    if(isTRUE(plotConf)) {
        plotConfusion(tstPred, 
                      trueCol=yVar, 
                      useSub=useSub, 
                      plotCont=isTRUE(isContVar), 
                      rndTo=rndTo, 
                      refXY=refXY
                      )
    }
    
    #6. Return data if requested
    if(isTRUE(returnData)) return(list(rf=rf, rfImp=rfImp, tstPred=tstPred, rfAcc=rfAcc))
    
}

# Test function with and without plotting
runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
          yVar="soil_moisture_100_to_255cm", 
          xVars=c("doy", "month"), 
          dfTest=dfTest %>% mutate(doy=yday(date)), 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=.0025, 
          refXY=TRUE, 
          returnData=TRUE
          )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 84.783% (RMSE 0.01 vs. 0.02 null)
## `geom_smooth()` using formula = 'y ~ x'

## $rf
## Ranger result
## 
## Call:
##  ranger::ranger(as.formula(paste0(yVar, "~", paste0(xVars, collapse = "+"))),      data = df[, c(yVar, xVars)], ...) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      78907 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.0002445276 
## R squared (OOB):                  0.5178946 
## 
## $rfImp
## # A tibble: 2 × 2
##   metric   imp
##   <chr>  <dbl>
## 1 doy     10.5
## 2 month   10.2
## 
## $tstPred
## # A tibble: 8,760 × 86
##    time                date        hour temperature_2m relativehumidity_2m
##    <dttm>              <date>     <int>          <dbl>               <int>
##  1 2022-01-01 01:00:00 2022-01-01     1            8.9                  98
##  2 2022-01-01 02:00:00 2022-01-01     2            8.8                  98
##  3 2022-01-01 03:00:00 2022-01-01     3            8.9                  99
##  4 2022-01-01 13:00:00 2022-01-01    13           11.3                  96
##  5 2022-01-01 14:00:00 2022-01-01    14           12.1                  93
##  6 2022-01-01 18:00:00 2022-01-01    18           10.8                  99
##  7 2022-01-01 23:00:00 2022-01-01    23            9.9                  99
##  8 2022-01-02 01:00:00 2022-01-02     1            9.7                  98
##  9 2022-01-02 02:00:00 2022-01-02     2            9.7                  97
## 10 2022-01-02 17:00:00 2022-01-02    17           11                    87
## # ℹ 8,750 more rows
## # ℹ 81 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## #   pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## #   rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## #   cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## #   direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## #   diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
## 
## $rfAcc
##      mseNull      msePred           r2 
## 5.402235e-04 8.220417e-05 8.478330e-01
# Test function with and without plotting
runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
          yVar="soil_moisture_100_to_255cm", 
          xVars=c("doy", "month"), 
          dfTest=dfTest %>% mutate(doy=yday(date)), 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          isContVar = TRUE, 
          rndTo=.0025, 
          refXY=TRUE, 
          makePlots=FALSE,
          returnData=TRUE
          )
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 84.812% (RMSE 0.01 vs. 0.02 null)
## $rf
## Ranger result
## 
## Call:
##  ranger::ranger(as.formula(paste0(yVar, "~", paste0(xVars, collapse = "+"))),      data = df[, c(yVar, xVars)], ...) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      78907 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.0002444528 
## R squared (OOB):                  0.5180421 
## 
## $rfImp
## # A tibble: 2 × 2
##   metric   imp
##   <chr>  <dbl>
## 1 doy    10.7 
## 2 month   9.98
## 
## $tstPred
## # A tibble: 8,760 × 86
##    time                date        hour temperature_2m relativehumidity_2m
##    <dttm>              <date>     <int>          <dbl>               <int>
##  1 2022-01-01 01:00:00 2022-01-01     1            8.9                  98
##  2 2022-01-01 02:00:00 2022-01-01     2            8.8                  98
##  3 2022-01-01 03:00:00 2022-01-01     3            8.9                  99
##  4 2022-01-01 13:00:00 2022-01-01    13           11.3                  96
##  5 2022-01-01 14:00:00 2022-01-01    14           12.1                  93
##  6 2022-01-01 18:00:00 2022-01-01    18           10.8                  99
##  7 2022-01-01 23:00:00 2022-01-01    23            9.9                  99
##  8 2022-01-02 01:00:00 2022-01-02     1            9.7                  98
##  9 2022-01-02 02:00:00 2022-01-02     2            9.7                  97
## 10 2022-01-02 17:00:00 2022-01-02    17           11                    87
## # ℹ 8,750 more rows
## # ℹ 81 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## #   pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## #   rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## #   cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## #   direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## #   diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
## 
## $rfAcc
##      mseNull      msePred           r2 
## 5.402235e-04 8.204962e-05 8.481191e-01

The model is run for each continuous variable, using only month as a predictor:

vCont <- c('temperature_2m', 'relativehumidity_2m', 'dewpoint_2m', 'apparent_temperature', 
           'pressure_msl', 'surface_pressure', 
           'precipitation', 'rain', 'snowfall', 
           'cloudcover', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_high', 
           'shortwave_radiation', 'direct_radiation', 'direct_normal_irradiance', 'diffuse_radiation',
           'windspeed_10m', 'windspeed_100m', 'winddirection_10m', 'winddirection_100m', 'windgusts_10m',
           'et0_fao_evapotranspiration', 'vapor_pressure_deficit', 
           'soil_temperature_0_to_7cm', 'soil_temperature_7_to_28cm', 
           'soil_temperature_28_to_100cm', 'soil_temperature_100_to_255cm', 
           'soil_moisture_0_to_7cm', 'soil_moisture_7_to_28cm', 
           'soil_moisture_28_to_100cm', 'soil_moisture_100_to_255cm'
           )

set.seed(24010817)

lstContMonth <- lapply(vCont, 
                       FUN=function(x) {
                           runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
                                     yVar=x, 
                                     xVars=c("month"), 
                                     dfTest=dfTest %>% mutate(doy=yday(date)), 
                                     isContVar = TRUE, 
                                     useLabel=paste0("predictions on 2022 holdout data for variable ", x),
                                     makePlots=FALSE,
                                     returnData=TRUE
                                     )
                          }
                      )
## 
## R-squared of predictions on 2022 holdout data for variable temperature_2m is: 76.59% (RMSE 4.98 vs. 10.3 null)
## 
## R-squared of predictions on 2022 holdout data for variable relativehumidity_2m is: 0.502% (RMSE 18.43 vs. 18.47 null)
## 
## R-squared of predictions on 2022 holdout data for variable dewpoint_2m is: 65.115% (RMSE 6.23 vs. 10.54 null)
## 
## R-squared of predictions on 2022 holdout data for variable apparent_temperature is: 77.426% (RMSE 6.1 vs. 12.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable pressure_msl is: 4.473% (RMSE 7.61 vs. 7.79 null)
## 
## R-squared of predictions on 2022 holdout data for variable surface_pressure is: 3.663% (RMSE 7.56 vs. 7.7 null)
## 
## R-squared of predictions on 2022 holdout data for variable precipitation is: -0.394% (RMSE 0.46 vs. 0.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable rain is: -0.435% (RMSE 0.45 vs. 0.45 null)
## 
## R-squared of predictions on 2022 holdout data for variable snowfall is: -0.778% (RMSE 0.06 vs. 0.06 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover is: 0.182% (RMSE 37.64 vs. 37.67 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_low is: 1.728% (RMSE 34.95 vs. 35.26 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_mid is: 0.063% (RMSE 35.52 vs. 35.53 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_high is: -0.22% (RMSE 42.23 vs. 42.18 null)
## 
## R-squared of predictions on 2022 holdout data for variable shortwave_radiation is: 6.519% (RMSE 241.87 vs. 250.16 null)
## 
## R-squared of predictions on 2022 holdout data for variable direct_radiation is: 4.901% (RMSE 193.81 vs. 198.74 null)
## 
## R-squared of predictions on 2022 holdout data for variable direct_normal_irradiance is: 1.106% (RMSE 302.77 vs. 304.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable diffuse_radiation is: 6.744% (RMSE 72.88 vs. 75.47 null)
## 
## R-squared of predictions on 2022 holdout data for variable windspeed_10m is: 5.376% (RMSE 5.68 vs. 5.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable windspeed_100m is: 9.601% (RMSE 8.95 vs. 9.41 null)
## 
## R-squared of predictions on 2022 holdout data for variable winddirection_10m is: -12.253% (RMSE 92.66 vs. 87.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable winddirection_100m is: -14.688% (RMSE 93.13 vs. 86.97 null)
## 
## R-squared of predictions on 2022 holdout data for variable windgusts_10m is: 3.768% (RMSE 10.85 vs. 11.07 null)
## 
## R-squared of predictions on 2022 holdout data for variable et0_fao_evapotranspiration is: 14.677% (RMSE 0.14 vs. 0.16 null)
## 
## R-squared of predictions on 2022 holdout data for variable vapor_pressure_deficit is: 33.99% (RMSE 0.5 vs. 0.61 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_0_to_7cm is: 80.634% (RMSE 4.67 vs. 10.61 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_7_to_28cm is: 90.538% (RMSE 2.89 vs. 9.39 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_28_to_100cm is: 94.093% (RMSE 1.86 vs. 7.67 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_100_to_255cm is: 96.474% (RMSE 1.1 vs. 5.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_0_to_7cm is: 58.669% (RMSE 0.07 vs. 0.1 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_7_to_28cm is: 63.769% (RMSE 0.05 vs. 0.08 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_28_to_100cm is: 80.704% (RMSE 0.03 vs. 0.06 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_100_to_255cm is: 83.916% (RMSE 0.01 vs. 0.02 null)
names(lstContMonth) <- vCont

Predictive power on the holdout 2022 data is explored:

sapply(lstContMonth, FUN=function(x) x[["rfAcc"]]["r2"]) %>% 
    as.data.frame() %>% 
    purrr::set_names("r2") %>% 
    rownames_to_column("metric") %>% 
    tibble::as_tibble() %>% 
    mutate(metric=str_replace(metric, pattern=".r2", replacement="")) %>% 
    arrange(r2) %>% 
    ggplot(aes(x=fct_reorder(metric, r2), y=r2)) + 
    geom_col(fill="lightblue") + 
    geom_text(aes(y=r2/2, label=round(r2, 2)), size=2.5) + 
    coord_flip() + 
    labs(x=NULL, 
         y="R2", 
         title="R-squared on holdout data", 
         subtitle="(random forest using only month as predictor)"
         )

Temperature, dewpoint, and soil moisture/temperature are best explained when using only month as a predictor

The model is run for each continuous variable, using only day of year as a predictor:

set.seed(24011017)

lstContDoY <- lapply(vCont, 
                     FUN=function(x) {
                         runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
                                   yVar=x, 
                                   xVars=c("doy"), 
                                   dfTest=dfTest %>% mutate(doy=yday(date)), 
                                   isContVar = TRUE, 
                                   useLabel=paste0("predictions on 2022 holdout data for variable ", x),
                                   makePlots=FALSE,
                                   returnData=TRUE
                                   )
                         }
                     )
## 
## R-squared of predictions on 2022 holdout data for variable temperature_2m is: 78.045% (RMSE 4.83 vs. 10.3 null)
## 
## R-squared of predictions on 2022 holdout data for variable relativehumidity_2m is: -3.319% (RMSE 18.78 vs. 18.47 null)
## 
## R-squared of predictions on 2022 holdout data for variable dewpoint_2m is: 65.464% (RMSE 6.2 vs. 10.54 null)
## 
## R-squared of predictions on 2022 holdout data for variable apparent_temperature is: 78.882% (RMSE 5.9 vs. 12.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable pressure_msl is: -3.404% (RMSE 7.92 vs. 7.79 null)
## 
## R-squared of predictions on 2022 holdout data for variable surface_pressure is: -4.152% (RMSE 7.86 vs. 7.7 null)
## 
## R-squared of predictions on 2022 holdout data for variable precipitation is: -3.214% (RMSE 0.47 vs. 0.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable rain is: -2.496% (RMSE 0.46 vs. 0.45 null)
## 
## R-squared of predictions on 2022 holdout data for variable snowfall is: -9.479% (RMSE 0.06 vs. 0.06 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover is: -4.219% (RMSE 38.46 vs. 37.67 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_low is: -2.546% (RMSE 35.7 vs. 35.26 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_mid is: -4.065% (RMSE 36.25 vs. 35.53 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_high is: -5.892% (RMSE 43.41 vs. 42.18 null)
## 
## R-squared of predictions on 2022 holdout data for variable shortwave_radiation is: 6.193% (RMSE 242.29 vs. 250.16 null)
## 
## R-squared of predictions on 2022 holdout data for variable direct_radiation is: 4.142% (RMSE 194.58 vs. 198.74 null)
## 
## R-squared of predictions on 2022 holdout data for variable direct_normal_irradiance is: 0.066% (RMSE 304.36 vs. 304.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable diffuse_radiation is: 6.251% (RMSE 73.08 vs. 75.47 null)
## 
## R-squared of predictions on 2022 holdout data for variable windspeed_10m is: 0.499% (RMSE 5.82 vs. 5.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable windspeed_100m is: 3.971% (RMSE 9.22 vs. 9.41 null)
## 
## R-squared of predictions on 2022 holdout data for variable winddirection_10m is: -15.196% (RMSE 93.87 vs. 87.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable winddirection_100m is: -18.047% (RMSE 94.49 vs. 86.97 null)
## 
## R-squared of predictions on 2022 holdout data for variable windgusts_10m is: -1.116% (RMSE 11.13 vs. 11.07 null)
## 
## R-squared of predictions on 2022 holdout data for variable et0_fao_evapotranspiration is: 14.467% (RMSE 0.14 vs. 0.16 null)
## 
## R-squared of predictions on 2022 holdout data for variable vapor_pressure_deficit is: 33.687% (RMSE 0.5 vs. 0.61 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_0_to_7cm is: 82.698% (RMSE 4.41 vs. 10.61 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_7_to_28cm is: 93.134% (RMSE 2.46 vs. 9.39 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_28_to_100cm is: 97.229% (RMSE 1.28 vs. 7.67 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_100_to_255cm is: 98.806% (RMSE 0.64 vs. 5.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_0_to_7cm is: 58.758% (RMSE 0.07 vs. 0.1 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_7_to_28cm is: 63.227% (RMSE 0.05 vs. 0.08 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_28_to_100cm is: 82.1% (RMSE 0.03 vs. 0.06 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_100_to_255cm is: 85.044% (RMSE 0.01 vs. 0.02 null)
names(lstContDoY) <- vCont

Predictive power on the holdout 2022 data is explored:

sapply(lstContMonth, FUN=function(x) x[["rfAcc"]]["r2"]) %>% 
    as.data.frame() %>% 
    purrr::set_names("r2") %>% 
    rownames_to_column("metric") %>% 
    tibble::as_tibble() %>% 
    mutate(metric=str_replace(metric, pattern=".r2", replacement=""), preds="month") %>%
    bind_rows(sapply(lstContDoY, FUN=function(x) x[["rfAcc"]]["r2"]) %>% 
                  as.data.frame() %>% 
                  purrr::set_names("r2") %>% 
                  rownames_to_column("metric") %>% 
                  tibble::as_tibble() %>% 
                  mutate(metric=str_replace(metric, pattern=".r2", replacement=""), preds="doy")
              ) %>%
    ggplot(aes(x=fct_reorder(metric, r2), y=r2)) + 
    geom_col(fill="lightblue") + 
    geom_text(aes(y=r2/2, label=round(r2, 2)), size=2.5) + 
    coord_flip() + 
    facet_wrap(~preds) +
    labs(x=NULL, 
         y="R2", 
         title="R-squared on holdout data", 
         subtitle="(random forest using only month or day-of-year as predictor)"
         )

The variables have similar predictive power, though day-of-year is slightly more useful for consistently seasonal, gradual change metrics (e.g., soil moisture/temperature) while being more prone to overfit on metrics that have small (if any) seasonality

The model is run for each continuous variable, using only hour of day (as both factor and integer) as a predictor:

set.seed(24011017)

lstContToD <- lapply(vCont, 
                     FUN=function(x) {
                         runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
                                   yVar=x, 
                                   xVars=c("hour", "fct_hour"), 
                                   dfTest=dfTest %>% mutate(doy=yday(date)), 
                                   isContVar = TRUE, 
                                   useLabel=paste0("predictions on 2022 holdout data for variable ", x),
                                   makePlots=FALSE,
                                   returnData=TRUE
                                   )
                         }
                     )
## 
## R-squared of predictions on 2022 holdout data for variable temperature_2m is: 4.854% (RMSE 10.05 vs. 10.3 null)
## 
## R-squared of predictions on 2022 holdout data for variable relativehumidity_2m is: 23.331% (RMSE 16.18 vs. 18.47 null)
## 
## R-squared of predictions on 2022 holdout data for variable dewpoint_2m is: -0.022% (RMSE 10.54 vs. 10.54 null)
## 
## R-squared of predictions on 2022 holdout data for variable apparent_temperature is: 3.091% (RMSE 12.64 vs. 12.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable pressure_msl is: -0.207% (RMSE 7.8 vs. 7.79 null)
## 
## R-squared of predictions on 2022 holdout data for variable surface_pressure is: -0.252% (RMSE 7.71 vs. 7.7 null)
## 
## R-squared of predictions on 2022 holdout data for variable precipitation is: -0.16% (RMSE 0.46 vs. 0.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable rain is: -0.149% (RMSE 0.45 vs. 0.45 null)
## 
## R-squared of predictions on 2022 holdout data for variable snowfall is: -0.234% (RMSE 0.06 vs. 0.06 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover is: 0.051% (RMSE 37.66 vs. 37.67 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_low is: 0.385% (RMSE 35.19 vs. 35.26 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_mid is: 0.107% (RMSE 35.51 vs. 35.53 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_high is: -0.111% (RMSE 42.21 vs. 42.18 null)
## 
## R-squared of predictions on 2022 holdout data for variable shortwave_radiation is: 69.204% (RMSE 138.82 vs. 250.16 null)
## 
## R-squared of predictions on 2022 holdout data for variable direct_radiation is: 54.254% (RMSE 134.42 vs. 198.74 null)
## 
## R-squared of predictions on 2022 holdout data for variable direct_normal_irradiance is: 53.312% (RMSE 208.03 vs. 304.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable diffuse_radiation is: 67.325% (RMSE 43.14 vs. 75.47 null)
## 
## R-squared of predictions on 2022 holdout data for variable windspeed_10m is: 2.213% (RMSE 5.77 vs. 5.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable windspeed_100m is: 2.731% (RMSE 9.28 vs. 9.41 null)
## 
## R-squared of predictions on 2022 holdout data for variable winddirection_10m is: -10.089% (RMSE 91.76 vs. 87.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable winddirection_100m is: -11.528% (RMSE 91.84 vs. 86.97 null)
## 
## R-squared of predictions on 2022 holdout data for variable windgusts_10m is: 2.201% (RMSE 10.94 vs. 11.07 null)
## 
## R-squared of predictions on 2022 holdout data for variable et0_fao_evapotranspiration is: 53.956% (RMSE 0.11 vs. 0.16 null)
## 
## R-squared of predictions on 2022 holdout data for variable vapor_pressure_deficit is: 21.069% (RMSE 0.54 vs. 0.61 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_0_to_7cm is: 6.285% (RMSE 10.27 vs. 10.61 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_7_to_28cm is: 0.141% (RMSE 9.38 vs. 9.39 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_28_to_100cm is: -0.405% (RMSE 7.69 vs. 7.67 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_100_to_255cm is: -0.881% (RMSE 5.87 vs. 5.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_0_to_7cm is: -1.298% (RMSE 0.1 vs. 0.1 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_7_to_28cm is: -1.715% (RMSE 0.08 vs. 0.08 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_28_to_100cm is: -0.644% (RMSE 0.06 vs. 0.06 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_100_to_255cm is: -0.011% (RMSE 0.02 vs. 0.02 null)
names(lstContToD) <- vCont

Predictive power on the holdout 2022 data is explored:

sapply(lstContToD, FUN=function(x) x[["rfAcc"]]["r2"]) %>% 
    as.data.frame() %>% 
    purrr::set_names("r2") %>% 
    rownames_to_column("metric") %>% 
    tibble::as_tibble() %>% 
    mutate(metric=str_replace(metric, pattern=".r2", replacement="")) %>% 
    arrange(r2) %>% 
    ggplot(aes(x=fct_reorder(metric, r2), y=r2)) + 
    geom_col(fill="lightblue") + 
    geom_text(aes(y=r2/2, label=round(r2, 2)), size=2.5) + 
    coord_flip() + 
    labs(x=NULL, 
         y="R2", 
         title="R-squared on holdout data", 
         subtitle="(random forest using only hour of day as predictor)"
         )

As expected, hour of day is a moderately strong predictor for radiation and associated metrics

The model is run for each continuous variable, using hour of day (as both factor and integer), month, and day of year as predictors:

set.seed(24011414)

lstContHDM <- lapply(vCont, 
                     FUN=function(x) {
                         runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
                                   yVar=x, 
                                   xVars=c("hour", "fct_hour", "month", "doy"), 
                                   dfTest=dfTest %>% mutate(doy=yday(date)), 
                                   isContVar = TRUE, 
                                   useLabel=paste0("predictions on 2022 holdout data for variable ", x),
                                   makePlots=FALSE,
                                   returnData=TRUE
                                   )
                         }
                     )
## 
## R-squared of predictions on 2022 holdout data for variable temperature_2m is: 83.21% (RMSE 4.22 vs. 10.3 null)
## 
## R-squared of predictions on 2022 holdout data for variable relativehumidity_2m is: 26.309% (RMSE 15.86 vs. 18.47 null)
## 
## R-squared of predictions on 2022 holdout data for variable dewpoint_2m is: 65.982% (RMSE 6.15 vs. 10.54 null)
## 
## R-squared of predictions on 2022 holdout data for variable apparent_temperature is: 82.153% (RMSE 5.42 vs. 12.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable pressure_msl is: -0.529% (RMSE 7.81 vs. 7.79 null)
## 
## R-squared of predictions on 2022 holdout data for variable surface_pressure is: -1.503% (RMSE 7.76 vs. 7.7 null)
## 
## R-squared of predictions on 2022 holdout data for variable precipitation is: -3.28% (RMSE 0.47 vs. 0.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable rain is: -3.199% (RMSE 0.46 vs. 0.45 null)
## 
## R-squared of predictions on 2022 holdout data for variable snowfall is: -15.832% (RMSE 0.06 vs. 0.06 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover is: -1.174% (RMSE 37.89 vs. 37.67 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_low is: 0.019% (RMSE 35.25 vs. 35.26 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_mid is: -2.049% (RMSE 35.9 vs. 35.53 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_high is: -4.107% (RMSE 43.04 vs. 42.18 null)
## 
## R-squared of predictions on 2022 holdout data for variable shortwave_radiation is: 82.679% (RMSE 104.11 vs. 250.16 null)
## 
## R-squared of predictions on 2022 holdout data for variable direct_radiation is: 65.042% (RMSE 117.51 vs. 198.74 null)
## 
## R-squared of predictions on 2022 holdout data for variable direct_normal_irradiance is: 58.024% (RMSE 197.26 vs. 304.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable diffuse_radiation is: 79.995% (RMSE 33.76 vs. 75.47 null)
## 
## R-squared of predictions on 2022 holdout data for variable windspeed_10m is: 5.367% (RMSE 5.68 vs. 5.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable windspeed_100m is: 8.606% (RMSE 9 vs. 9.41 null)
## 
## R-squared of predictions on 2022 holdout data for variable winddirection_10m is: -15.946% (RMSE 94.17 vs. 87.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable winddirection_100m is: -17.85% (RMSE 94.41 vs. 86.97 null)
## 
## R-squared of predictions on 2022 holdout data for variable windgusts_10m is: 3.551% (RMSE 10.87 vs. 11.07 null)
## 
## R-squared of predictions on 2022 holdout data for variable et0_fao_evapotranspiration is: 84.039% (RMSE 0.06 vs. 0.16 null)
## 
## R-squared of predictions on 2022 holdout data for variable vapor_pressure_deficit is: 68.147% (RMSE 0.35 vs. 0.61 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_0_to_7cm is: 90.177% (RMSE 3.32 vs. 10.61 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_7_to_28cm is: 93.446% (RMSE 2.4 vs. 9.39 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_28_to_100cm is: 97.111% (RMSE 1.3 vs. 7.67 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_100_to_255cm is: 98.746% (RMSE 0.65 vs. 5.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_0_to_7cm is: 58.529% (RMSE 0.07 vs. 0.1 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_7_to_28cm is: 62.84% (RMSE 0.05 vs. 0.08 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_28_to_100cm is: 81.509% (RMSE 0.03 vs. 0.06 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_100_to_255cm is: 84.46% (RMSE 0.01 vs. 0.02 null)
names(lstContHDM) <- vCont

Predictive power on the holdout 2022 data is explored:

tmpFun <- function(x) {
    sapply(get(x), FUN=function(y) y[["rfAcc"]]["r2"]) %>% 
        as.data.frame() %>% 
        purrr::set_names("r2") %>% 
        rownames_to_column("metric") %>% 
        tibble::as_tibble() %>% 
        mutate(metric=str_replace(metric, pattern=".r2", replacement=""), preds=x)
}

map_dfr(c("lstContDoY", "lstContHDM", "lstContMonth", "lstContToD"), .f=tmpFun) %>% 
    mutate(predLab=c("lstContDoY"="Day of Year", 
                     "lstContHDM"="All", 
                     "lstContMonth"="Month", 
                     "lstContToD"="Hour of Day"
                     )[preds]
           ) %>%
    ggplot(aes(x=fct_reorder(metric, r2, max), y=r2)) + 
    geom_point(aes(color=predLab)) + 
    coord_flip() +
    labs(x=NULL, 
         y="R2", 
         title="R-squared on holdout data", 
         subtitle="(random forest using combinations of hour, day of year, month as predictor)"
         ) + 
    geom_hline(yintercept=0, lty=2) +
    scale_color_discrete(NULL)

As expected, the random forest using all three variables generally performs similar to the best performing standalone metric

R-squared is compared between holdout and OOB training data:

tmpFun_v2 <- function(x) {
    sapply(get(x), FUN=function(y) c("rfTrain"=y[["rf"]][["r.squared"]], "rfTest"=y[["rfAcc"]][["r2"]])) %>% 
        t() %>% 
        as.data.frame() %>% 
        rownames_to_column("metric") %>% 
        tibble::as_tibble() %>% 
        pivot_longer(cols=-c(metric)) %>% 
        mutate(preds=x)
}

map_dfr(c("lstContDoY", "lstContHDM", "lstContMonth", "lstContToD"), .f=tmpFun_v2) %>% 
    mutate(predLab=c("lstContDoY"="Day of Year", 
                     "lstContHDM"="All", 
                     "lstContMonth"="Month", 
                     "lstContToD"="Hour of Day"
                     )[preds], 
           nameLab=c("rfTest"="Holdout R2", "rfTrain"="OOB R2 Estimate")[name]
           ) %>%
    ggplot(aes(x=fct_reorder(metric, value, max), y=value)) + 
    geom_point(aes(color=nameLab)) + 
    coord_flip() +
    labs(x=NULL, 
         y=NULL, 
         title="R-squared on holdout data vs. OOB training estimate", 
         subtitle="(random forest using combinations of hour, day of year, month as predictor)"
         ) + 
    geom_hline(yintercept=c(0, 1), lty=2) +
    facet_wrap(~predLab, nrow=1) +
    scale_color_discrete(NULL) + 
    theme(legend.position="bottom")

R-squared is compared between holdout and OOB training data, for the full model and the best single-variable model:

map_dfr(c("lstContDoY", "lstContHDM", "lstContMonth", "lstContToD"), .f=tmpFun_v2) %>% 
    mutate(predLab=c("lstContDoY"="Day of Year", 
                     "lstContHDM"="All", 
                     "lstContMonth"="Month", 
                     "lstContToD"="Hour of Day"
    )[preds], 
    nameLab=c("rfTest"="Holdout R2", "rfTrain"="OOB R2 Estimate")[name]
    ) %>% 
    group_by(metric, name, nameLab) %>% 
    summarize(maxAll=max(ifelse(predLab=="All", value, NA), na.rm=TRUE), 
              maxSingle=max(ifelse(predLab=="All", NA, value), na.rm=TRUE),
              .groups="drop"
              ) %>% 
    pivot_longer(cols=starts_with("max"), names_to="varType") %>% 
    ggplot(aes(x=fct_reorder(metric, value), y=value)) + 
    geom_point(aes(color=c("maxAll"="All", "maxSingle"="Best\nSingle")[varType])) + 
    facet_wrap(~nameLab) + 
    coord_flip() + 
    labs(x=NULL, 
         y="R-squared", 
         title="R-squared using best single predictor or all-three predictors", 
         subtitle="(random forest using combinations of hour, day of year, month as predictor)"
         ) + 
    geom_hline(yintercept=c(0, 1), lty=2) +
    scale_color_discrete(NULL)

Using several variables is especially helpful for predicting radiation and related variables (vapor pressure deficit, evapotranspiration) that are highly associated with sunny days in the warm season

A random forest is run for each variable using percentiles of all variables other than itself:

set.seed(24011816)

lstContNum <- lapply(vCont, 
                     FUN=function(x) {
                         runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
                                   yVar=x, 
                                   xVars=paste0("pct_", vCont[vCont!=x]), 
                                   dfTest=dfTest %>% mutate(doy=yday(date)), 
                                   isContVar = TRUE, 
                                   useLabel=paste0("predictions on 2022 holdout data for variable ", x),
                                   makePlots=FALSE,
                                   returnData=TRUE
                                   )
                         }
                     )
## 
## R-squared of predictions on 2022 holdout data for variable temperature_2m is: 99.829% (RMSE 0.43 vs. 10.3 null)
## 
## R-squared of predictions on 2022 holdout data for variable relativehumidity_2m is: 97.748% (RMSE 2.77 vs. 18.47 null)
## 
## R-squared of predictions on 2022 holdout data for variable dewpoint_2m is: 99.479% (RMSE 0.76 vs. 10.54 null)
## 
## R-squared of predictions on 2022 holdout data for variable apparent_temperature is: 99.794% (RMSE 0.58 vs. 12.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable pressure_msl is: 96.117% (RMSE 1.54 vs. 7.79 null)
## 
## R-squared of predictions on 2022 holdout data for variable surface_pressure is: 96.285% (RMSE 1.48 vs. 7.7 null)
## 
## R-squared of predictions on 2022 holdout data for variable precipitation is: 95.37% (RMSE 0.1 vs. 0.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable rain is: 95.338% (RMSE 0.1 vs. 0.45 null)
## 
## R-squared of predictions on 2022 holdout data for variable snowfall is: 83.756% (RMSE 0.02 vs. 0.06 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover is: 98.577% (RMSE 4.49 vs. 37.67 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_low is: 90.045% (RMSE 11.12 vs. 35.26 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_mid is: 84.218% (RMSE 14.12 vs. 35.53 null)
## 
## R-squared of predictions on 2022 holdout data for variable cloudcover_high is: 66.136% (RMSE 24.55 vs. 42.18 null)
## 
## R-squared of predictions on 2022 holdout data for variable shortwave_radiation is: 99.798% (RMSE 11.23 vs. 250.16 null)
## 
## R-squared of predictions on 2022 holdout data for variable direct_radiation is: 99.796% (RMSE 8.98 vs. 198.74 null)
## 
## R-squared of predictions on 2022 holdout data for variable direct_normal_irradiance is: 99.42% (RMSE 23.18 vs. 304.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable diffuse_radiation is: 96.79% (RMSE 13.52 vs. 75.47 null)
## 
## R-squared of predictions on 2022 holdout data for variable windspeed_10m is: 95.421% (RMSE 1.25 vs. 5.84 null)
## 
## R-squared of predictions on 2022 holdout data for variable windspeed_100m is: 94.495% (RMSE 2.21 vs. 9.41 null)
## 
## R-squared of predictions on 2022 holdout data for variable winddirection_10m is: 78.832% (RMSE 40.24 vs. 87.46 null)
## 
## R-squared of predictions on 2022 holdout data for variable winddirection_100m is: 78.689% (RMSE 40.15 vs. 86.97 null)
## 
## R-squared of predictions on 2022 holdout data for variable windgusts_10m is: 87.944% (RMSE 3.84 vs. 11.07 null)
## Growing trees.. Progress: 88%. Estimated remaining time: 4 seconds.
## 
## R-squared of predictions on 2022 holdout data for variable et0_fao_evapotranspiration is: 99.76% (RMSE 0.01 vs. 0.16 null)
## Growing trees.. Progress: 97%. Estimated remaining time: 0 seconds.
## 
## R-squared of predictions on 2022 holdout data for variable vapor_pressure_deficit is: 99.151% (RMSE 0.06 vs. 0.61 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_0_to_7cm is: 99.353% (RMSE 0.85 vs. 10.61 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_7_to_28cm is: 99.077% (RMSE 0.9 vs. 9.39 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_28_to_100cm is: 98.415% (RMSE 0.97 vs. 7.67 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_temperature_100_to_255cm is: 93.574% (RMSE 1.48 vs. 5.84 null)
## Growing trees.. Progress: 80%. Estimated remaining time: 7 seconds.
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_0_to_7cm is: 91.732% (RMSE 0.03 vs. 0.1 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_7_to_28cm is: 92.973% (RMSE 0.02 vs. 0.08 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_28_to_100cm is: 91.72% (RMSE 0.02 vs. 0.06 null)
## 
## R-squared of predictions on 2022 holdout data for variable soil_moisture_100_to_255cm is: 80.483% (RMSE 0.01 vs. 0.02 null)
names(lstContNum) <- vCont

Predictive power on the holdout 2022 data is explored:

sapply(lstContNum, FUN=function(x) x[["rfAcc"]]["r2"]) %>% 
    as.data.frame() %>% 
    purrr::set_names("r2") %>% 
    rownames_to_column("metric") %>% 
    tibble::as_tibble() %>% 
    mutate(metric=str_replace(metric, pattern=".r2", replacement="")) %>% 
    arrange(r2) %>% 
    ggplot(aes(x=fct_reorder(metric, r2), y=r2)) + 
    geom_col(fill="lightblue") + 
    geom_text(aes(y=r2/2, label=round(r2, 2)), size=2.5) + 
    coord_flip() + 
    labs(x=NULL, 
         y="R2", 
         title="R-squared on holdout data", 
         subtitle="(random forest using all variables other than self as predictors)"
         )

The variables have multiple, meaningful associations with each other, so combined predictive power is very high

The most important explanatory variables are explored:

dfImp <- map_dfr(lstContNum, .f=function(x) x$rfImp, .id="src") %>% 
    arrange(src, -imp) %>% 
    group_by(src) %>%
    mutate(pct=imp/sum(imp), cspct=cumsum(pct), n=row_number()) 

dfImp %>%
    ggplot(aes(x=n, y=cspct)) + 
    geom_line(aes(group=src), alpha=0.5) + 
    labs(x="# Explanatory Variables", 
         y="Cumulative Percent of Variable Importance", 
         title="Variable Importance vs. # Explanatory Variables", 
         subtitle="(random forest using all variables other than self as predictors)"
         )

dfImp %>%
    filter(n==4) %>%
    ggplot(aes(x=fct_reorder(src, cspct), y=cspct)) + 
    geom_col(fill="lightblue") +
    geom_text(aes(label=round(cspct, 2), y=cspct/2), size=3) +
    coord_flip() +
    labs(x=NULL, 
         y="Cumulative Percent of Variable Importance (top 4 variables)", 
         title="Variable Importance Explained by Top 4 Explanatory Variables", 
         subtitle="(random forest using all variables other than self as predictors)"
         )

The overlap of predictors is explored:

varOrder <- dfImp %>% 
    filter(n<=4) %>% 
    ungroup() %>% 
    count(metric, sort=TRUE) %>%
    pull(metric) %>%
    str_replace(pattern="pct_", replacement="")
varOrder <- c(varOrder, setdiff(dfImp %>% count(src) %>% pull(src), varOrder))
varOrder
##  [1] "soil_temperature_7_to_28cm"    "et0_fao_evapotranspiration"   
##  [3] "cloudcover_mid"                "soil_temperature_0_to_7cm"    
##  [5] "temperature_2m"                "dewpoint_2m"                  
##  [7] "apparent_temperature"          "cloudcover"                   
##  [9] "cloudcover_low"                "direct_radiation"             
## [11] "precipitation"                 "soil_temperature_28_to_100cm" 
## [13] "diffuse_radiation"             "relativehumidity_2m"          
## [15] "shortwave_radiation"           "soil_moisture_0_to_7cm"       
## [17] "soil_moisture_28_to_100cm"     "surface_pressure"             
## [19] "windspeed_100m"                "direct_normal_irradiance"     
## [21] "rain"                          "soil_temperature_100_to_255cm"
## [23] "cloudcover_high"               "pressure_msl"                 
## [25] "soil_moisture_100_to_255cm"    "soil_moisture_7_to_28cm"      
## [27] "vapor_pressure_deficit"        "winddirection_100m"           
## [29] "windgusts_10m"                 "windspeed_10m"                
## [31] "winddirection_10m"             "snowfall"
dfImp %>%
    filter(n<=4) %>%
    mutate(predictor=str_replace(metric, pattern="pct_", replacement="")) %>%
    ggplot(aes(x=factor(src, levels=varOrder), y=factor(predictor, levels=rev(varOrder)))) + 
    geom_tile(aes(fill=factor(n))) +
    coord_flip() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    labs(x="Dependent Variable", 
         y="Explanatory Variables", 
         title="Top 4 Importance Explanatory Variables per Dependent Variable", 
         subtitle="(random forest using all variables other than self as predictors)"
         ) + 
    scale_fill_discrete(NULL)

The model for temperature_2m is run using only the best four predictors:

runFullRF(dfTrain=dfTrain %>% mutate(doy=yday(date)), 
          yVar="temperature_2m", 
          xVars=dfImp %>% filter(n<=4, src=="temperature_2m") %>% pull(metric), 
          dfTest=dfTest %>% mutate(doy=yday(date)), 
          isContVar = TRUE, 
          useLabel=keyLabel, 
          useSub=stringr::str_to_sentence(keyLabel), 
          makePlots=FALSE,
          returnData=TRUE
          )[c("rfImp", "rfAcc")]
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.565% (RMSE 0.68 vs. 10.3 null)
## $rfImp
## # A tibble: 4 × 2
##   metric                              imp
##   <chr>                             <dbl>
## 1 pct_apparent_temperature       4185398.
## 2 pct_soil_temperature_0_to_7cm  2576757.
## 3 pct_dewpoint_2m                 199948.
## 4 pct_soil_temperature_7_to_28cm 1141860.
## 
## $rfAcc
##     mseNull     msePred          r2 
## 106.1501502   0.4619588   0.9956481

Surface air temperature is well predicted by apparent temperature, shallow soil temperature, and dewpoint

The process is converted to functional form:

runPartialImportanceRF <- function(dfTrain, 
                                   yVar, 
                                   dfTest,
                                   impDB=dfImp,
                                   nImp=+Inf,
                                   otherX=c(),
                                   isContVar=TRUE, 
                                   useLabel=keyLabel, 
                                   useSub=stringr::str_to_sentence(keyLabel), 
                                   makePlots=FALSE, 
                                   returnElem=c("rfImp", "rfAcc")
                                   ) {
    
    # FUNCTION ARGUMENTS
    # dfTrain: training data
    # yVar: y variable in dfTrain
    # dfTest: test data
    # impDB: tibble containing variable importance by dependent variable
    # nImp: use the top nImp variables by variable importance
    # otherX: include these additional x variables
    # isContVar: boolean, is this a continuous variable (regression)? FALSE means classification
    # useLabel: label for description
    # useSub: label for plot
    # makePlots: boolean, should plots be created?
    # returnElem: character vector of list elements to be returned

    runFullRF(dfTrain=dfTrain, 
              yVar=yVar, 
              xVars=unique(c(impDB %>% filter(n<=nImp, src==yVar) %>% pull(metric), otherX)), 
              dfTest=dfTest, 
              isContVar = isContVar, 
              useLabel=useLabel, 
              useSub=useSub, 
              makePlots=makePlots,
              returnData=TRUE
              )[returnElem]
    
}

The function is tested for temperature:

runPartialImportanceRF(dfTrain=dfTrain, 
                       yVar="temperature_2m", 
                       dfTest=dfTest, 
                       impDB=dfImp,
                       nImp=4, 
                       makePlots=TRUE
                       )

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.567% (RMSE 0.68 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'

## $rfImp
## # A tibble: 4 × 2
##   metric                              imp
##   <chr>                             <dbl>
## 1 pct_apparent_temperature       4222317.
## 2 pct_soil_temperature_0_to_7cm  2599389.
## 3 pct_dewpoint_2m                 192700.
## 4 pct_soil_temperature_7_to_28cm 1092429.
## 
## $rfAcc
##     mseNull     msePred          r2 
## 106.1501502   0.4599354   0.9956671

The function is updated to allow for rounding:

runPartialImportanceRF <- function(dfTrain, 
                                   yVar, 
                                   dfTest,
                                   impDB=dfImp,
                                   nImp=+Inf,
                                   otherX=c(),
                                   isContVar=TRUE, 
                                   useLabel=keyLabel, 
                                   useSub=stringr::str_to_sentence(keyLabel), 
                                   rndTo=NULL,
                                   refXY=FALSE,
                                   makePlots=FALSE, 
                                   returnElem=c("rfImp", "rfAcc")
                                   ) {
    
    # FUNCTION ARGUMENTS
    # dfTrain: training data
    # yVar: y variable in dfTrain
    # dfTest: test data
    # impDB: tibble containing variable importance by dependent variable
    # nImp: use the top nImp variables by variable importance
    # otherX: include these additional x variables
    # isContVar: boolean, is this a continuous variable (regression)? FALSE means classification
    # useLabel: label for description
    # useSub: label for plot
    # rndTo: controls the rounding parameter for plots, passed to runFullRF (NULL means no rounding)
    # refXY: controls the reference line parameter for plots, passed to runFullRF
    # makePlots: boolean, should plots be created?
    # returnElem: character vector of list elements to be returned

    runFullRF(dfTrain=dfTrain, 
              yVar=yVar, 
              xVars=unique(c(impDB %>% filter(n<=nImp, src==yVar) %>% pull(metric), otherX)), 
              dfTest=dfTest, 
              isContVar = isContVar, 
              useLabel=useLabel, 
              useSub=useSub, 
              rndTo=rndTo,
              refXY=refXY,
              makePlots=makePlots,
              returnData=TRUE
              )[returnElem]
    
}